merged ocl module from ocl branch (it's not quite usable yet; at least on Mac)

Vadim Pisarevsky 2012-07-16 17:08:14 +00:00
parent cfeb28f2ba
commit 9f4efab40a
120 changed files with 59724 additions and 0 deletions


@@ -139,6 +139,7 @@ OCV_OPTION(WITH_VIDEOINPUT "Build HighGUI with DirectShow support" ON
OCV_OPTION(WITH_XIMEA "Include XIMEA cameras support" OFF IF (NOT ANDROID AND NOT APPLE) )
OCV_OPTION(WITH_XINE "Include Xine support (GPL)" OFF IF (UNIX AND NOT APPLE AND NOT ANDROID) )
OCV_OPTION(WITH_CLP "Include Clp support (EPL)" OFF)
OCV_OPTION(WITH_OPENCL "Include OpenCL Runtime support" OFF IF (NOT ANDROID AND NOT IOS) )
# OpenCV build components
# ===================================================
@@ -389,6 +390,13 @@ else()
SET(CAN_BUILD_ANDROID_PROJECTS FALSE)
endif()
# --- OpenCL ---
if(WITH_OPENCL)
include(cmake/OpenCVDetectOpenCL.cmake REQUIRED)
if(OPENCL_FOUND)
set(HAVE_OPENCL 1)
endif()
endif()
# ----------------------------------------------------------------------------
# Solution folders:
@@ -715,6 +723,8 @@ if(DEFINED WITH_CUDA)
status(" Use Cuda:" HAVE_CUDA THEN "YES (ver ${CUDA_VERSION_STRING})" ELSE NO)
endif(DEFINED WITH_CUDA)
status(" Use OpenCL:" HAVE_OPENCL THEN YES ELSE NO)
status(" Use Eigen:" HAVE_EIGEN THEN "YES (ver ${EIGEN_WORLD_VERSION}.${EIGEN_MAJOR_VERSION}.${EIGEN_MINOR_VERSION})" ELSE NO)
status(" Use Clp:" HAVE_CLP THEN YES ELSE NO)


@@ -0,0 +1,64 @@
if(APPLE)
set(OPENCL_FOUND YES)
set(OPENCL_LIBRARIES "-framework OpenCL")
else()
find_package(OpenCL)
# Try AMD/ATI Stream SDK
if (NOT OPENCL_FOUND)
set(ENV_AMDSTREAMSDKROOT $ENV{AMDAPPSDKROOT})
set(ENV_OPENCLROOT $ENV{OPENCLROOT})
set(ENV_CUDA_PATH $ENV{CUDA_PATH})
if(ENV_AMDSTREAMSDKROOT)
set(OPENCL_INCLUDE_SEARCH_PATH ${ENV_AMDSTREAMSDKROOT}/include)
if(CMAKE_SIZEOF_VOID_P EQUAL 4)
set(OPENCL_LIB_SEARCH_PATH ${OPENCL_LIB_SEARCH_PATH} ${ENV_AMDSTREAMSDKROOT}/lib/x86)
else()
set(OPENCL_LIB_SEARCH_PATH ${OPENCL_LIB_SEARCH_PATH} ${ENV_AMDSTREAMSDKROOT}/lib/x86_64)
endif()
elseif(ENV_CUDA_PATH AND WIN32)
set(OPENCL_INCLUDE_SEARCH_PATH ${ENV_CUDA_PATH}/include)
if(CMAKE_SIZEOF_VOID_P EQUAL 4)
set(OPENCL_LIB_SEARCH_PATH ${OPENCL_LIB_SEARCH_PATH} ${ENV_CUDA_PATH}/lib/Win32)
else()
set(OPENCL_LIB_SEARCH_PATH ${OPENCL_LIB_SEARCH_PATH} ${ENV_CUDA_PATH}/lib/x64)
endif()
elseif(ENV_OPENCLROOT AND UNIX)
set(OPENCL_INCLUDE_SEARCH_PATH ${ENV_OPENCLROOT}/inc)
if(CMAKE_SIZEOF_VOID_P EQUAL 4)
set(OPENCL_LIB_SEARCH_PATH ${OPENCL_LIB_SEARCH_PATH} /usr/lib)
else()
set(OPENCL_LIB_SEARCH_PATH ${OPENCL_LIB_SEARCH_PATH} /usr/lib64)
endif()
endif()
if(OPENCL_INCLUDE_SEARCH_PATH)
find_path(OPENCL_INCLUDE_DIR
NAMES CL/cl.h OpenCL/cl.h
PATHS ${OPENCL_INCLUDE_SEARCH_PATH}
NO_DEFAULT_PATH)
else()
find_path(OPENCL_INCLUDE_DIR
NAMES CL/cl.h OpenCL/cl.h)
endif()
if(OPENCL_LIB_SEARCH_PATH)
find_library(OPENCL_LIBRARY NAMES OpenCL PATHS ${OPENCL_LIB_SEARCH_PATH} NO_DEFAULT_PATH)
else()
find_library(OPENCL_LIBRARY NAMES OpenCL)
endif()
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(
OPENCL
DEFAULT_MSG
OPENCL_LIBRARY OPENCL_INCLUDE_DIR
)
if(OPENCL_FOUND)
set(OPENCL_LIBRARIES ${OPENCL_LIBRARY})
else()
set(OPENCL_LIBRARIES)
endif()
endif()
endif()


@@ -172,6 +172,9 @@
/* NVidia Cuda Runtime API*/
#cmakedefine HAVE_CUDA
/* OpenCL Support */
#cmakedefine HAVE_OPENCL
/* NVidia Cuda Fast Fourier Transform (FFT) API*/
#cmakedefine HAVE_CUFFT


@@ -0,0 +1,63 @@
# Will be modified later
if(NOT HAVE_OPENCL)
ocv_module_disable(ocl)
endif()
set(the_description "OpenCL-accelerated Computer Vision")
ocv_add_module(ocl opencv_core opencv_imgproc opencv_calib3d opencv_objdetect opencv_video opencv_nonfree)
ocv_module_include_directories()
file(GLOB CL_FILES "${CMAKE_CURRENT_SOURCE_DIR}/src/kernels/*.cl")
set(kernels_cpp "${CMAKE_CURRENT_BINARY_DIR}/kernels.cpp")
set(cl2cpp_script "${CMAKE_CURRENT_SOURCE_DIR}/cl2cpp.py")
add_custom_command(
OUTPUT ${kernels_cpp}
COMMAND ${PYTHON_EXECUTABLE} ${cl2cpp_script} "${CMAKE_CURRENT_SOURCE_DIR}/src/kernels" ${kernels_cpp}
DEPENDS ${CL_FILES} ${cl2cpp_script})
file(GLOB lib_hdrs "include/opencv2/${name}/*.hpp" "include/opencv2/${name}/*.h")
file(GLOB lib_srcs "src/*.cpp")
file(GLOB lib_int_hdrs "src/*.h*")
source_group("Include" FILES ${lib_hdrs})
source_group("Src\\Host" FILES ${lib_srcs} ${lib_int_hdrs} ${kernels_cpp})
if (HAVE_OPENCL)
set(ocl_link_libs ${OPENCL_LIBRARIES})
if(OPENCL_INCLUDE_DIR)
ocv_include_directories(${OPENCL_INCLUDE_DIR})
endif()
endif()
ocv_set_module_sources(
HEADERS ${lib_hdrs}
SOURCES ${lib_int_hdrs} ${lib_srcs}
)
set(OPENCV_MODULE_opencv_ocl_SOURCES ${OPENCV_MODULE_opencv_ocl_SOURCES} ${kernels_cpp})
ocv_create_module(${ocl_link_libs})
install(FILES ${lib_hdrs}
DESTINATION include/opencv2/${name}
COMPONENT main)
ocv_add_precompiled_headers(${the_module})
################################################################################################################
################################ OpenCL Module Tests ##################################################
################################################################################################################
file(GLOB test_srcs "test/*.cpp")
file(GLOB test_hdrs "test/*.hpp" "test/*.h")
ocv_add_accuracy_tests(FILES "Include" ${test_hdrs}
FILES "Src" ${test_srcs})
################################################################################################################
################################ OpenCL Module Performance ##################################################
################################################################################################################
#file(GLOB perf_srcs "perf/*.cpp")
#file(GLOB perf_hdrs "perf/*.hpp" "perf/*.h")
#ocv_add_perf_tests(FILES "Include" ${perf_hdrs}
# FILES "Src" ${perf_srcs})

modules/ocl/cl2cpp.py (new file)

@@ -0,0 +1,55 @@
import os, os.path, sys, glob

indir = sys.argv[1]
outname = sys.argv[2]
#indir = "/Users/vp/work/ocv/opencv/modules/ocl/src/kernels"
#outname = "/Users/vp/work/ocv.build/xcode/modules/ocl/kernels.cpp"

try:
    os.mkdir(os.path.dirname(outname))
except OSError:
    pass

cl_list = glob.glob(os.path.join(indir, "*.cl"))
kfile = open(outname, "wt")

kfile.write("""// This file is auto-generated. Do not edit!

#include "precomp.hpp"

namespace cv
{
namespace ocl
{
""")

for cl in cl_list:
    cl_file = open(cl, "rt")
    cl_filename = os.path.basename(cl)
    cl_filename = cl_filename[:cl_filename.rfind(".")]
    kfile.write("const char* %s=" % cl_filename)
    state = 0
    for cl_line in cl_file.readlines():
        l = cl_line.strip()
        # skip the leading comments
        if l.startswith("//") and l.find("*/") < 0:
            if state == 0:
                state = 1
        else:
            if state == 1 or l.find("*/") >= 0:
                state = 2
        if state == 1:
            continue
        l = l.replace("\\", "\\\\")
        l = l.replace("\r", "")
        l = l.replace("\"", "\\\"")
        l = l.replace("\t", " ")
        kfile.write("\"%s\\n\"\n" % l)
    kfile.write(";\n")
    cl_file.close()

kfile.write("""}
}
""")
kfile.close()
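
For illustration, a hypothetical kernel file foo.cl whose only non-comment line is "__kernel void foo() {}" would be embedded into the generated kernels.cpp roughly like this (a sketch of the script's output, not a verbatim capture):

// This file is auto-generated. Do not edit!

#include "precomp.hpp"

namespace cv
{
namespace ocl
{
const char* foo="__kernel void foo() {}\n"
;
}
}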


@@ -0,0 +1,19 @@
OpenCL Module Introduction
==========================

.. highlight:: cpp

General Information
-------------------

The OpenCV OCL module is a set of classes and functions to utilize OpenCL-compatible devices. It should support any device compatible with OpenCL 1.1. The module includes utility functions, low-level vision primitives, and a few high-level algorithms ready to be used in end-user applications.

The OpenCV OCL module is designed as a host-level API plus device-level kernels. The device-level kernels are converted to text strings and compiled at run time, so the module requires OpenCL runtime support. To run the OpenCV OCL module correctly, make sure you have the OpenCL runtime provided by your device vendor, which is normally part of the device driver.

The OpenCV OCL module is designed for ease of use and does not require any knowledge of OpenCL. Such knowledge will certainly be useful, though, to handle non-trivial cases or to achieve the highest performance. It is helpful to understand the cost of various operations, what the module does, what the preferred data formats are, and so on. Since there is data transfer between the OpenCL host and the OpenCL device, for better performance it is recommended to copy data to OpenCL device memory once (i.e. copy a ``cv::Mat`` to a ``cv::ocl::oclMat``), then call several ``cv::ocl`` functions, and then copy the result back to CPU memory, rather than performing a forward and backward transfer for each OCL function.

To enable OCL support, configure OpenCV using CMake with the option ``WITH_OPENCL=ON``. If the option is passed and an OpenCL SDK is installed (e.g. on Mac OS X this is always the case), the full-featured OpenCV OCL module is built. Otherwise, the module is not built.

Right now, the user should create a ``cv::ocl::Info`` object in the application and call ``cv::ocl::getDevice`` before any ``cv::ocl::<func>``. This operation initializes the OpenCL runtime and sets the first device found as the computing device. If there is more than one device and you want to use a non-default device, call ``cv::ocl::setDevice``.

In the current version, all threads share the same context and device, so multiple devices are not supported. This is to be fixed in future releases.
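
The intended calling pattern then looks like the following sketch (illustrative only: the file name is a placeholder, and it assumes ``getDevice`` returns the number of devices found)::

    #include "opencv2/ocl/ocl.hpp"
    #include "opencv2/highgui/highgui.hpp"

    int main()
    {
        std::vector<cv::ocl::Info> oclinfo;
        if (cv::ocl::getDevice(oclinfo) == 0)   // must precede any other cv::ocl call
            return -1;                          // no OpenCL-capable device found

        cv::Mat src = cv::imread("input.png", 0), dst;
        cv::ocl::oclMat d_src(src), d_dst;      // one upload to the device ...
        cv::ocl::threshold(d_src, d_dst, 128, 255, cv::THRESH_BINARY); // ... several ocl calls ...
        d_dst.download(dst);                    // ... one download back to CPU memory
        return 0;
    }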

modules/ocl/doc/ocl.rst (new file)

@@ -0,0 +1,19 @@
***************************************
ocl. OpenCL-accelerated Computer Vision
***************************************

.. toctree::
    :maxdepth: 1

    introduction
    initalization_and_information
    data_structures
    operations_on_matrices
    per_element_operations
    image_processing
    matrix_reductions
    object_detection
    feature_detection_and_description
    image_filtering
    camera_calibration_and_3d_reconstruction
    video


@@ -0,0 +1,23 @@
Data Structures and Functions
=============================

.. highlight:: cpp

ocl::Info
---------
.. ocv:class:: ocl::Info

This class should be maintained by the user and passed to ``getDevice``.

ocl::getDevice
--------------
Returns the list of available devices.

.. ocv:function:: int ocl::getDevice(std::vector<Info>& oclinfo, int devicetype = CVCL_DEVICE_TYPE_GPU)

    :param oclinfo: Output vector of ``ocl::Info`` structures.

    :param devicetype: One of ``CVCL_DEVICE_TYPE_GPU``, ``CVCL_DEVICE_TYPE_CPU`` or ``CVCL_DEVICE_TYPE_DEFAULT``.

The function must be called before any other ``cv::ocl`` function; it initializes the OpenCL runtime.
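
For example, a device-selection sketch (illustrative; it assumes the return value is the number of devices found)::

    std::vector<cv::ocl::Info> oclinfo;
    int ndev = cv::ocl::getDevice(oclinfo, CVCL_DEVICE_TYPE_ALL); // enumerate CPU and GPU devices
    if (ndev > 0)
        cv::ocl::setDevice(oclinfo[0], 0);    // explicitly pick the first device of the first Info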


@@ -0,0 +1,456 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_OCL_MATRIX_OPERATIONS_HPP__
#define __OPENCV_OCL_MATRIX_OPERATIONS_HPP__
namespace cv
{
namespace ocl
{
////////////////////////////////////OpenCL kernel strings//////////////////////////
extern const char *convertC3C4;
////////////////////////////////////////////////////////////////////////
//////////////////////////////// oclMat ////////////////////////////////
////////////////////////////////////////////////////////////////////////
inline oclMat::oclMat() : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0), offset(0), wholerows(0), wholecols(0) {}
inline oclMat::oclMat(int _rows, int _cols, int _type) : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0), offset(0), wholerows(0), wholecols(0)
{
if( _rows > 0 && _cols > 0 )
create( _rows, _cols, _type );
}
inline oclMat::oclMat(Size _size, int _type) : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0), offset(0), wholerows(0), wholecols(0)
{
if( _size.height > 0 && _size.width > 0 )
create( _size.height, _size.width, _type );
}
inline oclMat::oclMat(int _rows, int _cols, int _type, const Scalar &_s)
: flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0), offset(0), wholerows(0), wholecols(0)
{
if(_rows > 0 && _cols > 0)
{
create(_rows, _cols, _type);
*this = _s;
}
}
inline oclMat::oclMat(Size _size, int _type, const Scalar &_s)
: flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0), offset(0), wholerows(0), wholecols(0)
{
if( _size.height > 0 && _size.width > 0 )
{
create( _size.height, _size.width, _type );
*this = _s;
}
}
inline oclMat::oclMat(const oclMat &m)
: flags(m.flags), rows(m.rows), cols(m.cols), step(m.step), data(m.data),
refcount(m.refcount), datastart(m.datastart), dataend(m.dataend), clCxt(m.clCxt), offset(m.offset), wholerows(m.wholerows), wholecols(m.wholecols)
{
if( refcount )
CV_XADD(refcount, 1);
}
//Fixme, the data is not correct if _data points to the CPU memory
inline oclMat::oclMat(int _rows, int _cols, int _type, void *_data, size_t _step)
: flags(Mat::MAGIC_VAL + (_type &TYPE_MASK)), rows(_rows), cols(_cols), step(_step), data((uchar *)_data), refcount(0),
datastart((uchar *)_data), dataend((uchar *)_data), offset(0), wholerows(_rows), wholecols(_cols)
{
size_t minstep = cols * elemSize();
if( step == Mat::AUTO_STEP )
{
step = minstep;
flags |= Mat::CONTINUOUS_FLAG;
}
else
{
if( rows == 1 ) step = minstep;
CV_DbgAssert( step >= minstep );
flags |= step == minstep ? Mat::CONTINUOUS_FLAG : 0;
}
dataend += step * (rows - 1) + minstep;
}
//Fixme, the data is not correct if _data points to the CPU memory
inline oclMat::oclMat(Size _size, int _type, void *_data, size_t _step)
: flags(Mat::MAGIC_VAL + (_type &TYPE_MASK)), rows(_size.height), cols(_size.width),
step(_step), data((uchar *)_data), refcount(0),
datastart((uchar *)_data), dataend((uchar *)_data), offset(0), wholerows(_size.height), wholecols(_size.width)
{
size_t minstep = cols * elemSize();
if( step == Mat::AUTO_STEP )
{
step = minstep;
flags |= Mat::CONTINUOUS_FLAG;
}
else
{
if( rows == 1 ) step = minstep;
CV_DbgAssert( step >= minstep );
flags |= step == minstep ? Mat::CONTINUOUS_FLAG : 0;
}
dataend += step * (rows - 1) + minstep;
}
inline oclMat::oclMat(const oclMat &m, const Range &rowRange, const Range &colRange)
{
flags = m.flags;
step = m.step;
refcount = m.refcount;
data = m.data;
datastart = m.datastart;
dataend = m.dataend;
wholerows = m.wholerows;
wholecols = m.wholecols;
offset = m.offset;
if( rowRange == Range::all() )
rows = m.rows;
else
{
CV_Assert( 0 <= rowRange.start && rowRange.start <= rowRange.end && rowRange.end <= m.rows );
rows = rowRange.size();
offset += step * rowRange.start;
}
if( colRange == Range::all() )
cols = m.cols;
else
{
CV_Assert( 0 <= colRange.start && colRange.start <= colRange.end && colRange.end <= m.cols );
cols = colRange.size();
offset += colRange.start * elemSize();
flags &= cols < m.cols ? ~Mat::CONTINUOUS_FLAG : -1;
}
if( rows == 1 )
flags |= Mat::CONTINUOUS_FLAG;
if( refcount )
CV_XADD(refcount, 1);
if( rows <= 0 || cols <= 0 )
rows = cols = 0;
}
inline oclMat::oclMat(const oclMat &m, const Rect &roi)
: flags(m.flags), rows(roi.height), cols(roi.width),
step(m.step), data(m.data), refcount(m.refcount),
datastart(m.datastart), dataend(m.dataend), clCxt(m.clCxt), offset(m.offset), wholerows(m.wholerows), wholecols(m.wholecols)
{
flags &= roi.width < m.cols ? ~Mat::CONTINUOUS_FLAG : -1;
offset += roi.y * step + roi.x * elemSize();
CV_Assert( 0 <= roi.x && 0 <= roi.width && roi.x + roi.width <= m.cols &&
0 <= roi.y && 0 <= roi.height && roi.y + roi.height <= m.rows );
if( refcount )
CV_XADD(refcount, 1);
if( rows <= 0 || cols <= 0 )
rows = cols = 0;
}
inline oclMat::oclMat(const Mat &m)
: flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0) , offset(0), wholerows(0), wholecols(0)
{
//clCxt = Context::getContext();
upload(m);
}
inline oclMat::~oclMat()
{
release();
}
inline oclMat &oclMat::operator = (const oclMat &m)
{
if( this != &m )
{
if( m.refcount )
CV_XADD(m.refcount, 1);
release();
clCxt = m.clCxt;
flags = m.flags;
rows = m.rows;
cols = m.cols;
step = m.step;
data = m.data;
datastart = m.datastart;
dataend = m.dataend;
offset = m.offset;
wholerows = m.wholerows;
wholecols = m.wholecols;
refcount = m.refcount;
}
return *this;
}
inline oclMat &oclMat::operator = (const Mat &m)
{
//clCxt = Context::getContext();
upload(m);
return *this;
}
/* Fixme! To be supported in OpenCL later. */
#if 0
template <class T> inline oclMat::operator DevMem2D_<T>() const
{
return DevMem2D_<T>(rows, cols, (T *)data, step);
}
template <class T> inline oclMat::operator PtrStep_<T>() const
{
return PtrStep_<T>(static_cast< DevMem2D_<T> >(*this));
}
#endif
//CPP: void oclMat::upload(const Mat& m);
inline oclMat::operator Mat() const
{
Mat m;
download(m);
return m;
}
//CPP void oclMat::download(cv::Mat& m) const;
inline oclMat oclMat::row(int y) const
{
return oclMat(*this, Range(y, y + 1), Range::all());
}
inline oclMat oclMat::col(int x) const
{
return oclMat(*this, Range::all(), Range(x, x + 1));
}
inline oclMat oclMat::rowRange(int startrow, int endrow) const
{
return oclMat(*this, Range(startrow, endrow), Range::all());
}
inline oclMat oclMat::rowRange(const Range &r) const
{
return oclMat(*this, r, Range::all());
}
inline oclMat oclMat::colRange(int startcol, int endcol) const
{
return oclMat(*this, Range::all(), Range(startcol, endcol));
}
inline oclMat oclMat::colRange(const Range &r) const
{
return oclMat(*this, Range::all(), r);
}
inline oclMat oclMat::clone() const
{
oclMat m;
copyTo(m);
return m;
}
//CPP void oclMat::copyTo( oclMat& m ) const;
//CPP void oclMat::copyTo( oclMat& m, const oclMat& mask ) const;
//CPP void oclMat::convertTo( oclMat& m, int rtype, double alpha=1, double beta=0 ) const;
inline void oclMat::assignTo( oclMat &m, int type ) const
{
if( type < 0 )
m = *this;
else
convertTo(m, type);
}
//CPP oclMat& oclMat::operator = (const Scalar& s);
//CPP oclMat& oclMat::setTo(const Scalar& s, const oclMat& mask=oclMat());
//CPP oclMat oclMat::reshape(int _cn, int _rows=0) const;
inline void oclMat::create(Size _size, int _type)
{
create(_size.height, _size.width, _type);
}
//CPP void oclMat::create(int _rows, int _cols, int _type);
//CPP void oclMat::release();
inline void oclMat::swap(oclMat &b)
{
std::swap( flags, b.flags );
std::swap( rows, b.rows );
std::swap( cols, b.cols );
std::swap( step, b.step );
std::swap( data, b.data );
std::swap( datastart, b.datastart );
std::swap( dataend, b.dataend );
std::swap( refcount, b.refcount );
std::swap( offset, b.offset );
std::swap( wholerows, b.wholerows );
std::swap( wholecols, b.wholecols );
}
inline void oclMat::locateROI( Size &wholeSize, Point &ofs ) const
{
size_t esz = elemSize();//, minstep;
//ptrdiff_t delta1 = offset;//, delta2 = dataend - datastart;
CV_DbgAssert( step > 0 );
if( offset == 0 )
ofs.x = ofs.y = 0;
else
{
ofs.y = (int)(offset / step);
ofs.x = (int)((offset - step * ofs.y) / esz);
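// e.g. step = 1024 bytes, esz = 4, offset = 5128  =>  ofs.y = 5, ofs.x = 2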
//CV_DbgAssert( data == datastart + ofs.y*step + ofs.x*esz );
}
//minstep = (ofs.x + cols)*esz;
//wholeSize.height = (int)((delta2 - minstep)/step + 1);
//wholeSize.height = std::max(wholeSize.height, ofs.y + rows);
//wholeSize.width = (int)((delta2 - step*(wholeSize.height-1))/esz);
//wholeSize.width = std::max(wholeSize.width, ofs.x + cols);
wholeSize.height = wholerows;
wholeSize.width = wholecols;
}
inline oclMat &oclMat::adjustROI( int dtop, int dbottom, int dleft, int dright )
{
Size wholeSize;
Point ofs;
size_t esz = elemSize();
locateROI( wholeSize, ofs );
int row1 = std::max(ofs.y - dtop, 0), row2 = std::min(ofs.y + rows + dbottom, wholeSize.height);
int col1 = std::max(ofs.x - dleft, 0), col2 = std::min(ofs.x + cols + dright, wholeSize.width);
offset += (row1 - ofs.y) * step + (col1 - ofs.x) * esz;
rows = row2 - row1;
cols = col2 - col1;
if( esz *cols == step || rows == 1 )
flags |= Mat::CONTINUOUS_FLAG;
else
flags &= ~Mat::CONTINUOUS_FLAG;
return *this;
}
inline oclMat oclMat::operator()( Range rowRange, Range colRange ) const
{
return oclMat(*this, rowRange, colRange);
}
inline oclMat oclMat::operator()( const Rect &roi ) const
{
return oclMat(*this, roi);
}
inline bool oclMat::isContinuous() const
{
return (flags & Mat::CONTINUOUS_FLAG) != 0;
}
inline size_t oclMat::elemSize() const
{
return CV_ELEM_SIZE(flags);
}
inline size_t oclMat::elemSize1() const
{
return CV_ELEM_SIZE1(flags);
}
inline int oclMat::type() const
{
return CV_MAT_TYPE(flags);
}
inline int oclMat::depth() const
{
return CV_MAT_DEPTH(flags);
}
inline int oclMat::channels() const
{
return CV_MAT_CN(flags);
}
inline size_t oclMat::step1() const
{
return step / elemSize1();
}
inline Size oclMat::size() const
{
return Size(cols, rows);
}
inline bool oclMat::empty() const
{
return data == 0;
}
//fixme, the ROI operation is not correct.
inline uchar *oclMat::ptr(int y)
{
CV_DbgAssert( (unsigned)y < (unsigned)rows );
return data + step * y;
}
inline const uchar *oclMat::ptr(int y) const
{
CV_DbgAssert( (unsigned)y < (unsigned)rows );
return data + step * y;
}
template<typename _Tp> inline _Tp *oclMat::ptr(int y)
{
CV_DbgAssert( (unsigned)y < (unsigned)rows );
return (_Tp *)(data + step * y);
}
template<typename _Tp> inline const _Tp *oclMat::ptr(int y) const
{
CV_DbgAssert( (unsigned)y < (unsigned)rows );
return (const _Tp *)(data + step * y);
}
inline oclMat oclMat::t() const
{
oclMat tmp;
transpose(*this, tmp);
return tmp;
}
static inline void swap( oclMat &a, oclMat &b )
{
a.swap(b);
}
} /* end of namespace ocl */
} /* end of namespace cv */
#endif /* __OPENCV_OCL_MATRIX_OPERATIONS_HPP__ */


@@ -0,0 +1,864 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_OCL_HPP__
#define __OPENCV_OCL_HPP__
#include <memory>
#include <vector>
#include "opencv2/core/core.hpp"
#include "opencv2/imgproc/imgproc.hpp"
#include "opencv2/objdetect/objdetect.hpp"
namespace cv
{
namespace ocl
{
using std::auto_ptr;
#define CVCL_DEVICE_TYPE_DEFAULT (1 << 0)
#define CVCL_DEVICE_TYPE_CPU (1 << 1)
#define CVCL_DEVICE_TYPE_GPU (1 << 2)
#define CVCL_DEVICE_TYPE_ACCELERATOR (1 << 3)
//#define CVCL_DEVICE_TYPE_CUSTOM (1 << 4)
#define CVCL_DEVICE_TYPE_ALL 0xFFFFFFFF
//this class contains ocl runtime information
class CV_EXPORTS Info
{
public:
struct Impl;
Impl *impl;
Info();
Info(const Info &m);
~Info();
void release();
Info &operator = (const Info &m);
};
//////////////////////////////// Initialization & Info ////////////////////////
//this function may be obsoleted
//CV_EXPORTS cl_device_id getDevice();
//the function must be called before any other cv::ocl function; it initializes the ocl runtime
CV_EXPORTS int getDevice(std::vector<Info>& oclinfo, int devicetype = CVCL_DEVICE_TYPE_GPU);
//set the device you want to use; optional, to be called after getDevice
CV_EXPORTS void setDevice(Info &oclinfo, int devnum = 0);
//this function is not ready yet
//CV_EXPORTS void getComputeCapability(cl_device_id device, int &major, int &minor);
//optional function: if you want to save the compiled OpenCL binary kernels to a file, set its path
CV_EXPORTS void setBinpath(const char *path);
//////////////////////////////// Error handling ////////////////////////
CV_EXPORTS void error(const char *error_string, const char *file, const int line, const char *func);
//////////////////////////////// OpenCL context ////////////////////////
//This is a global singleton class used to represent an OpenCL context.
class Context
{
protected:
Context();
friend class auto_ptr<Context>;
static auto_ptr<Context> clCxt;
public:
~Context();
static int val;
static Context *getContext();
static void setContext(Info &oclinfo);
struct Impl;
Impl *impl;
};
//////////////////////////////// oclMat ////////////////////////////////
class CV_EXPORTS oclMat
{
public:
//! default constructor
oclMat();
//! constructs oclMatrix of the specified size and type (_type is CV_8UC1, CV_64FC3, CV_32SC(12) etc.)
oclMat(int rows, int cols, int type);
oclMat(Size size, int type);
//! constructs oclMatrix and fills it with the specified value _s.
oclMat(int rows, int cols, int type, const Scalar &s);
oclMat(Size size, int type, const Scalar &s);
//! copy constructor
oclMat(const oclMat &m);
//! constructor for oclMatrix headers pointing to user-allocated data
oclMat(int rows, int cols, int type, void *data, size_t step = Mat::AUTO_STEP);
oclMat(Size size, int type, void *data, size_t step = Mat::AUTO_STEP);
//! creates a matrix header for a part of the bigger matrix
oclMat(const oclMat &m, const Range &rowRange, const Range &colRange);
oclMat(const oclMat &m, const Rect &roi);
//! builds oclMat from Mat. Performs blocking upload to device.
explicit oclMat (const Mat &m);
//! destructor - calls release()
~oclMat();
//! assignment operators
oclMat &operator = (const oclMat &m);
//! assignment operator. Performs blocking upload to device.
oclMat &operator = (const Mat &m);
/* Fixme! To be supported in OpenCL later. */
#if 0
//! returns lightweight DevMem2D_ structure for passing to nvcc-compiled code.
// Contains just image size, data ptr and step.
template <class T> operator DevMem2D_<T>() const;
template <class T> operator PtrStep_<T>() const;
#endif
//! performs blocking upload of data to oclMat.
void upload(const cv::Mat &m);
/* Fixme! To be supported in OpenCL later. */
#if 0
//! upload async
void upload(const CudaMem &m, Stream &stream);
#endif
//! downloads data from device to host memory. Blocking calls.
operator Mat() const;
void download(cv::Mat &m) const;
/* Fixme! To be supported in OpenCL later. */
#if 0
//! download async
void download(CudaMem &m, Stream &stream) const;
#endif
//! returns a new oclMatrix header for the specified row
oclMat row(int y) const;
//! returns a new oclMatrix header for the specified column
oclMat col(int x) const;
//! ... for the specified row span
oclMat rowRange(int startrow, int endrow) const;
oclMat rowRange(const Range &r) const;
//! ... for the specified column span
oclMat colRange(int startcol, int endcol) const;
oclMat colRange(const Range &r) const;
//! returns deep copy of the oclMatrix, i.e. the data is copied
oclMat clone() const;
//! copies the oclMatrix content to "m".
// It calls m.create(this->size(), this->type()).
// It supports any data type
void copyTo( oclMat &m ) const;
//! copies those oclMatrix elements to "m" that are marked with non-zero mask elements.
//It supports 8UC1 8UC4 32SC1 32SC4 32FC1 32FC4
void copyTo( oclMat &m, const oclMat &mask ) const;
//! converts oclMatrix to another datatype with optional scaling. See cvConvertScale.
//It supports 8UC1 8UC4 32SC1 32SC4 32FC1 32FC4
void convertTo( oclMat &m, int rtype, double alpha = 1, double beta = 0 ) const;
void assignTo( oclMat &m, int type = -1 ) const;
//! sets every oclMatrix element to s
//It supports 8UC1 8UC4 32SC1 32SC4 32FC1 32FC4
oclMat &operator = (const Scalar &s);
//! sets some of the oclMatrix elements to s, according to the mask
//It supports 8UC1 8UC4 32SC1 32SC4 32FC1 32FC4
oclMat &setTo(const Scalar &s, const oclMat &mask = oclMat());
//! creates alternative oclMatrix header for the same data, with different
// number of channels and/or different number of rows. see cvReshape.
oclMat reshape(int cn, int rows = 0) const;
//! allocates new oclMatrix data unless the oclMatrix already has specified size and type.
// previous data is unreferenced if needed.
void create(int rows, int cols, int type);
void create(Size size, int type);
//! decreases reference counter;
// deallocate the data when reference counter reaches 0.
void release();
//! swaps with other smart pointer
void swap(oclMat &mat);
//! locates oclMatrix header within a parent oclMatrix. See below
void locateROI( Size &wholeSize, Point &ofs ) const;
//! moves/resizes the current oclMatrix ROI inside the parent oclMatrix.
oclMat &adjustROI( int dtop, int dbottom, int dleft, int dright );
//! extracts a rectangular sub-oclMatrix
// (this is a generalized form of row, rowRange etc.)
oclMat operator()( Range rowRange, Range colRange ) const;
oclMat operator()( const Rect &roi ) const;
//! returns true if the oclMatrix data is continuous
// (i.e. when there are no gaps between successive rows).
// similar to CV_IS_oclMat_CONT(cvoclMat->type)
bool isContinuous() const;
//! returns element size in bytes,
// similar to CV_ELEM_SIZE(cvMat->type)
size_t elemSize() const;
//! returns the size of element channel in bytes.
size_t elemSize1() const;
//! returns element type, similar to CV_MAT_TYPE(cvMat->type)
int type() const;
//! returns element type, similar to CV_MAT_DEPTH(cvMat->type)
int depth() const;
//! returns element type, similar to CV_MAT_CN(cvMat->type)
int channels() const;
//! returns step/elemSize1()
size_t step1() const;
//! returns oclMatrix size:
// width == number of columns, height == number of rows
Size size() const;
//! returns true if oclMatrix data is NULL
bool empty() const;
//! returns pointer to y-th row
uchar *ptr(int y = 0);
const uchar *ptr(int y = 0) const;
//! template version of the above method
template<typename _Tp> _Tp *ptr(int y = 0);
template<typename _Tp> const _Tp *ptr(int y = 0) const;
//! matrix transposition
oclMat t() const;
/*! includes several bit-fields:
- the magic signature
- continuity flag
- depth
- number of channels
*/
int flags;
//! the number of rows and columns
int rows, cols;
//! a distance between successive rows in bytes; includes the gap if any
size_t step;
//! pointer to the data(OCL memory object)
uchar *data;
//! pointer to the reference counter;
// when oclMatrix points to user-allocated data, the pointer is NULL
int *refcount;
//! helper fields used in locateROI and adjustROI
//datastart and dataend are not used in current version
uchar *datastart;
uchar *dataend;
//! OpenCL context associated with the oclMat object.
Context *clCxt;
//offset for ROI handling, calculated in bytes
int offset;
//add wholerows and wholecols for the whole matrix, datastart and dataend are no longer used
int wholerows;
int wholecols;
//add download_channels for 3 channels to 4 channels
int download_channels;
};
///////////////////// mat split and merge /////////////////////////////////
//! Compose a multi-channel array from several single-channel arrays
// Support all types
CV_EXPORTS void merge(const oclMat *src, size_t n, oclMat &dst);
CV_EXPORTS void merge(const vector<oclMat> &src, oclMat &dst);
//! Divides multi-channel array into several single-channel arrays
// Support all types
CV_EXPORTS void split(const oclMat &src, oclMat *dst);
CV_EXPORTS void split(const oclMat &src, vector<oclMat> &dst);
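// Usage sketch (illustrative, not part of the original header): assuming a
// 4-channel source matrix src already on the device,
//     oclMat channels[4], merged;
//     cv::ocl::split(src, channels);        // 4-channel src -> four single-channel mats
//     cv::ocl::merge(channels, 4, merged);  // and compose them back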
////////////////////////////// Arithmetics ///////////////////////////////////
//#if defined DOUBLE_SUPPORT
//typedef double F;
//#else
//typedef float F;
//#endif
// CV_EXPORTS void addWeighted(const oclMat& a,F alpha, const oclMat& b,F beta,F gama, oclMat& c);
CV_EXPORTS void addWeighted(const oclMat &a, double alpha, const oclMat &b, double beta, double gama, oclMat &c);
//! adds one matrix to another (c = a + b)
// supports all types except CV_8SC1, CV_8SC2, CV_8SC3 and CV_8SC4
CV_EXPORTS void add(const oclMat &a, const oclMat &b, oclMat &c);
//! adds one matrix to another (c = a + b)
// supports all types except CV_8SC1, CV_8SC2, CV_8SC3 and CV_8SC4
CV_EXPORTS void add(const oclMat &a, const oclMat &b, oclMat &c, const oclMat &mask);
//! adds scalar to a matrix (c = a + s)
// supports all types except CV_8SC1, CV_8SC2, CV_8SC3 and CV_8SC4
CV_EXPORTS void add(const oclMat &a, const Scalar &sc, oclMat &c, const oclMat &mask = oclMat());
//! subtracts one matrix from another (c = a - b)
// supports all types except CV_8SC1, CV_8SC2, CV_8SC3 and CV_8SC4
CV_EXPORTS void subtract(const oclMat &a, const oclMat &b, oclMat &c);
//! subtracts one matrix from another (c = a - b)
// supports all types except CV_8SC1, CV_8SC2, CV_8SC3 and CV_8SC4
CV_EXPORTS void subtract(const oclMat &a, const oclMat &b, oclMat &c, const oclMat &mask);
//! subtracts scalar from a matrix (c = a - s)
// supports all types except CV_8SC1, CV_8SC2, CV_8SC3 and CV_8SC4
CV_EXPORTS void subtract(const oclMat &a, const Scalar &sc, oclMat &c, const oclMat &mask = oclMat());
//! subtracts a matrix from a scalar (c = s - a)
// supports all types except CV_8SC1, CV_8SC2, CV_8SC3 and CV_8SC4
CV_EXPORTS void subtract(const Scalar &sc, const oclMat &a, oclMat &c, const oclMat &mask = oclMat());
//! computes element-wise product of the two arrays (c = a * b)
// supports all types except CV_8SC1, CV_8SC2, CV_8SC3 and CV_8SC4
CV_EXPORTS void multiply(const oclMat &a, const oclMat &b, oclMat &c, double scale = 1);
//! computes element-wise quotient of the two arrays (c = a / b)
// supports all types except CV_8SC1, CV_8SC2, CV_8SC3 and CV_8SC4
CV_EXPORTS void divide(const oclMat &a, const oclMat &b, oclMat &c, double scale = 1);
//! computes element-wise quotient of a scalar and an array (c = scale / b)
// supports all types except CV_8SC1, CV_8SC2, CV_8SC3 and CV_8SC4
CV_EXPORTS void divide(double scale, const oclMat &b, oclMat &c);
//! compares elements of two arrays (c = a <cmpop> b)
// supports all types except CV_8SC1, CV_8SC2, CV_8SC3 and CV_8SC4
CV_EXPORTS void compare(const oclMat &a, const oclMat &b, oclMat &c, int cmpop);
//! transposes the matrix
// supports CV_8UC1, 8UC4, 8SC4, 16UC2, 16SC2, 32SC1 and 32FC1 (the same as cuda)
CV_EXPORTS void transpose(const oclMat &src1, oclMat &dst);
//! computes element-wise absolute difference of two arrays (c = abs(a - b))
// supports all types except CV_8SC1, CV_8SC2, CV_8SC3 and CV_8SC4
CV_EXPORTS void absdiff(const oclMat &a, const oclMat &b, oclMat &c);
//! computes element-wise absolute difference of array and scalar (c = abs(a - s))
// supports all types except CV_8SC1, CV_8SC2, CV_8SC3 and CV_8SC4
CV_EXPORTS void absdiff(const oclMat &a, const Scalar &s, oclMat &c);
//! computes mean value and standard deviation of all or selected array elements
// supports all types except CV_32F and CV_64F
CV_EXPORTS void meanStdDev(const oclMat &mtx, Scalar &mean, Scalar &stddev);
//! computes norm of array
// supports NORM_INF, NORM_L1, NORM_L2
// supports only CV_8UC1 type
CV_EXPORTS double norm(const oclMat &src1, int normType = NORM_L2);
//! computes norm of the difference between two arrays
// supports NORM_INF, NORM_L1, NORM_L2
// supports only CV_8UC1 type
CV_EXPORTS double norm(const oclMat &src1, const oclMat &src2, int normType = NORM_L2);
//! reverses the order of the rows, columns or both in a matrix
// supports all types
CV_EXPORTS void flip(const oclMat &a, oclMat &b, int flipCode);
//! computes sum of array elements
// disabled until the crash is fixed
// support all types
CV_EXPORTS Scalar sum(const oclMat &m);
//! finds global minimum and maximum array elements and returns their values
// support all types
CV_EXPORTS void minMax(const oclMat &src, double *minVal, double *maxVal = 0, const oclMat &mask = oclMat());
//! finds global minimum and maximum array elements and returns their values with locations
// support all types
CV_EXPORTS void minMaxLoc(const oclMat &src, double *minVal, double *maxVal = 0, Point *minLoc = 0, Point *maxLoc = 0,
const oclMat &mask = oclMat());
//! counts non-zero array elements
// support all types
CV_EXPORTS int countNonZero(const oclMat &src);
//! transforms 8-bit unsigned integers using lookup table: dst(i)=lut(src(i))
// destination array will have the same depth as lut and the same number of channels as source
//It supports 8UC1 8UC4 only
CV_EXPORTS void LUT(const oclMat &src, const oclMat &lut, oclMat &dst);
//! only 8UC1 and 256 bins are supported now
CV_EXPORTS void calcHist(const oclMat &mat_src, oclMat &mat_hist);
//! only 8UC1 and 256 bins are supported now
CV_EXPORTS void equalizeHist(const oclMat &mat_src, oclMat &mat_dst);
//! bilateralFilter
// supports 8UC1 8UC4
CV_EXPORTS void bilateralFilter(const oclMat &, oclMat &, int , double, double, int);
//! computes exponent of each matrix element (b = e**a)
// supports only CV_32FC1 type
CV_EXPORTS void exp(const oclMat &a, oclMat &b);
//! computes natural logarithm of absolute value of each matrix element: b = log(abs(a))
// supports only CV_32FC1 type
CV_EXPORTS void log(const oclMat &a, oclMat &b);
//! computes magnitude of each (x(i), y(i)) vector
// supports only CV_32F CV_64F type
CV_EXPORTS void magnitude(const oclMat &x, const oclMat &y, oclMat &magnitude);
CV_EXPORTS void magnitudeSqr(const oclMat &x, const oclMat &y, oclMat &magnitude);
CV_EXPORTS void magnitudeSqr(const oclMat &x, oclMat &magnitude);
//! computes angle (angle(i)) of each (x(i), y(i)) vector
// supports only CV_32F CV_64F type
CV_EXPORTS void phase(const oclMat &x, const oclMat &y, oclMat &angle, bool angleInDegrees = false);
//! the function raises every element of the input array to p
//! support only CV_32F CV_64F type
CV_EXPORTS void pow(const oclMat &x, double p, oclMat &y);
//! converts Cartesian coordinates to polar
// supports only CV_32F CV_64F type
CV_EXPORTS void cartToPolar(const oclMat &x, const oclMat &y, oclMat &magnitude, oclMat &angle, bool angleInDegrees = false);
//! converts polar coordinates to Cartesian
// supports only CV_32F CV_64F type
CV_EXPORTS void polarToCart(const oclMat &magnitude, const oclMat &angle, oclMat &x, oclMat &y, bool angleInDegrees = false);
//! performs per-element bit-wise inversion
// supports all types
CV_EXPORTS void bitwise_not(const oclMat &src, oclMat &dst);
//! calculates per-element bit-wise disjunction of two arrays
// supports all types
CV_EXPORTS void bitwise_or(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask = oclMat());
CV_EXPORTS void bitwise_or(const oclMat &src1, const Scalar &s, oclMat &dst, const oclMat &mask = oclMat());
//! calculates per-element bit-wise conjunction of two arrays
// supports all types
CV_EXPORTS void bitwise_and(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask = oclMat());
CV_EXPORTS void bitwise_and(const oclMat &src1, const Scalar &s, oclMat &dst, const oclMat &mask = oclMat());
//! calculates per-element bit-wise "exclusive or" operation
// supports all types
CV_EXPORTS void bitwise_xor(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask = oclMat());
CV_EXPORTS void bitwise_xor(const oclMat &src1, const Scalar &s, oclMat &dst, const oclMat &mask = oclMat());
//! Logical operators
CV_EXPORTS oclMat operator ~ (const oclMat &src);
CV_EXPORTS oclMat operator | (const oclMat &src1, const oclMat &src2);
CV_EXPORTS oclMat operator & (const oclMat &src1, const oclMat &src2);
CV_EXPORTS oclMat operator ^ (const oclMat &src1, const oclMat &src2);
CV_EXPORTS void cvtColor(const oclMat &src, oclMat &dst, int code , int dcn = 0);
//////////////////////////////// Filter Engine ////////////////////////////////
/*!
The Base Class for 1D or Row-wise Filters
This is the base class for linear or non-linear filters that process 1D data.
In particular, such filters are used for the "horizontal" filtering parts in separable filters.
*/
class CV_EXPORTS BaseRowFilter_GPU
{
public:
BaseRowFilter_GPU(int ksize_, int anchor_, int bordertype_) : ksize(ksize_), anchor(anchor_), bordertype(bordertype_) {}
virtual ~BaseRowFilter_GPU() {}
virtual void operator()(const oclMat &src, oclMat &dst) = 0;
int ksize, anchor, bordertype;
};
/*!
The Base Class for Column-wise Filters
This is the base class for linear or non-linear filters that process columns of 2D arrays.
Such filters are used for the "vertical" filtering parts in separable filters.
*/
class CV_EXPORTS BaseColumnFilter_GPU
{
public:
BaseColumnFilter_GPU(int ksize_, int anchor_, int bordertype_) : ksize(ksize_), anchor(anchor_), bordertype(bordertype_) {}
virtual ~BaseColumnFilter_GPU() {}
virtual void operator()(const oclMat &src, oclMat &dst) = 0;
int ksize, anchor, bordertype;
};
/*!
The Base Class for Non-Separable 2D Filters.
This is the base class for linear or non-linear 2D filters.
*/
class CV_EXPORTS BaseFilter_GPU
{
public:
BaseFilter_GPU(const Size &ksize_, const Point &anchor_, const int &borderType_)
: ksize(ksize_), anchor(anchor_), borderType(borderType_) {}
virtual ~BaseFilter_GPU() {}
virtual void operator()(const oclMat &src, oclMat &dst) = 0;
Size ksize;
Point anchor;
int borderType;
};
/*!
The Base Class for Filter Engine.
The class can be used to apply an arbitrary filtering operation to an image.
It contains all the necessary intermediate buffers.
*/
class CV_EXPORTS FilterEngine_GPU
{
public:
virtual ~FilterEngine_GPU() {}
virtual void apply(const oclMat &src, oclMat &dst, Rect roi = Rect(0, 0, -1, -1)) = 0;
};
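// Usage sketch (illustrative): build a separable Gaussian engine and apply it
// to a device matrix d_src (names here are placeholders).
//     Mat k = getGaussianKernel(5, 1.5, CV_32F);
//     Ptr<FilterEngine_GPU> f = createSeparableLinearFilter_GPU(CV_8UC4, CV_8UC4, k, k);
//     f->apply(d_src, d_dst);               // filters the whole image by default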
//! returns the non-separable filter engine with the specified filter
CV_EXPORTS Ptr<FilterEngine_GPU> createFilter2D_GPU(const Ptr<BaseFilter_GPU> filter2D);
//! returns the primitive row filter with the specified kernel
CV_EXPORTS Ptr<BaseRowFilter_GPU> getLinearRowFilter_GPU(int srcType, int bufType, const Mat &rowKernel,
int anchor = -1, int bordertype = BORDER_DEFAULT);
//! returns the primitive column filter with the specified kernel
CV_EXPORTS Ptr<BaseColumnFilter_GPU> getLinearColumnFilter_GPU(int bufType, int dstType, const Mat &columnKernel,
int anchor = -1, int bordertype = BORDER_DEFAULT, double delta = 0.0);
//! returns the separable linear filter engine
CV_EXPORTS Ptr<FilterEngine_GPU> createSeparableLinearFilter_GPU(int srcType, int dstType, const Mat &rowKernel,
const Mat &columnKernel, const Point &anchor = Point(-1, -1), double delta = 0.0, int bordertype = BORDER_DEFAULT);
//! returns the separable filter engine with the specified filters
CV_EXPORTS Ptr<FilterEngine_GPU> createSeparableFilter_GPU(const Ptr<BaseRowFilter_GPU>& rowFilter,
const Ptr<BaseColumnFilter_GPU>& columnFilter);
//! returns the Gaussian filter engine
CV_EXPORTS Ptr<FilterEngine_GPU> createGaussianFilter_GPU(int type, Size ksize, double sigma1, double sigma2 = 0, int bordertype = BORDER_DEFAULT);
//! returns filter engine for the generalized Sobel operator
CV_EXPORTS Ptr<FilterEngine_GPU> createDerivFilter_GPU( int srcType, int dstType, int dx, int dy, int ksize, int borderType=BORDER_DEFAULT );
//! applies Laplacian operator to the image
// supports only ksize = 1 and ksize = 3; 8UC1, 8UC4, 32FC1 and 32FC4 data types
CV_EXPORTS void Laplacian(const oclMat &src, oclMat &dst, int ddepth, int ksize = 1, double scale = 1);
//! returns 2D box filter
// supports CV_8UC1 and CV_8UC4 source type, dst type must be the same as source type
CV_EXPORTS Ptr<BaseFilter_GPU> getBoxFilter_GPU(int srcType, int dstType,
const Size &ksize, Point anchor = Point(-1, -1), int borderType = BORDER_DEFAULT);
//! returns box filter engine
CV_EXPORTS Ptr<FilterEngine_GPU> createBoxFilter_GPU(int srcType, int dstType, const Size &ksize,
const Point &anchor = Point(-1, -1), int borderType = BORDER_DEFAULT);
//! returns 2D filter with the specified kernel
// supports CV_8UC1 and CV_8UC4 types
CV_EXPORTS Ptr<BaseFilter_GPU> getLinearFilter_GPU(int srcType, int dstType, const Mat &kernel, const Size &ksize,
Point anchor = Point(-1, -1), int borderType = BORDER_DEFAULT);
//! returns the non-separable linear filter engine
CV_EXPORTS Ptr<FilterEngine_GPU> createLinearFilter_GPU(int srcType, int dstType, const Mat &kernel,
const Point &anchor = Point(-1, -1), int borderType = BORDER_DEFAULT);
//! smooths the image using the normalized box filter
// supports data type: CV_8UC1, CV_8UC4, CV_32FC1 and CV_32FC4
// supports border type: BORDER_CONSTANT, BORDER_REPLICATE, BORDER_REFLECT,BORDER_REFLECT_101,BORDER_WRAP
CV_EXPORTS void boxFilter(const oclMat &src, oclMat &dst, int ddepth, Size ksize,
Point anchor = Point(-1, -1), int borderType = BORDER_DEFAULT);
//! returns 2D morphological filter
//! only MORPH_ERODE and MORPH_DILATE are supported
// supports CV_8UC1, CV_8UC4, CV_32FC1 and CV_32FC4 types
// kernel must have CV_8UC1 type, one row, and cols == ksize.width * ksize.height
CV_EXPORTS Ptr<BaseFilter_GPU> getMorphologyFilter_GPU(int op, int type, const Mat &kernel, const Size &ksize,
Point anchor = Point(-1, -1));
//! returns morphological filter engine. Only MORPH_ERODE and MORPH_DILATE are supported.
CV_EXPORTS Ptr<FilterEngine_GPU> createMorphologyFilter_GPU(int op, int type, const Mat &kernel,
const Point &anchor = Point(-1, -1), int iterations = 1);
//! a synonym for normalized box filter
// supports data type: CV_8UC1, CV_8UC4, CV_32FC1 and CV_32FC4
// supports border type: BORDER_CONSTANT, BORDER_REPLICATE, BORDER_REFLECT,BORDER_REFLECT_101
static inline void blur(const oclMat &src, oclMat &dst, Size ksize, Point anchor = Point(-1, -1),
int borderType = BORDER_CONSTANT)
{
boxFilter(src, dst, -1, ksize, anchor, borderType);
}
//! applies non-separable 2D linear filter to the image
CV_EXPORTS void filter2D(const oclMat &src, oclMat &dst, int ddepth, const Mat &kernel,
Point anchor = Point(-1, -1), int borderType = BORDER_DEFAULT);
//! applies separable 2D linear filter to the image
CV_EXPORTS void sepFilter2D(const oclMat &src, oclMat &dst, int ddepth, const Mat &kernelX, const Mat &kernelY,
Point anchor = Point(-1, -1), double delta = 0.0, int bordertype = BORDER_DEFAULT);
//! applies generalized Sobel operator to the image
// dst.type must equal src.type
// supports data type: CV_8UC1, CV_8UC4, CV_32FC1 and CV_32FC4
// supports border type: BORDER_CONSTANT, BORDER_REPLICATE, BORDER_REFLECT,BORDER_REFLECT_101
CV_EXPORTS void Sobel(const oclMat &src, oclMat &dst, int ddepth, int dx, int dy, int ksize = 3, double scale = 1, double delta = 0.0, int bordertype = BORDER_DEFAULT);
//! applies the vertical or horizontal Scharr operator to the image
// dst.type must equal src.type
// supports data type: CV_8UC1, CV_8UC4, CV_32FC1 and CV_32FC4
// supports border type: BORDER_CONSTANT, BORDER_REPLICATE, BORDER_REFLECT,BORDER_REFLECT_101
CV_EXPORTS void Scharr(const oclMat &src, oclMat &dst, int ddepth, int dx, int dy, double scale = 1, double delta = 0.0, int bordertype = BORDER_DEFAULT);
//! smooths the image using Gaussian filter.
// dst.type must equal src.type
// supports data type: CV_8UC1, CV_8UC4, CV_32FC1 and CV_32FC4
// supports border type: BORDER_CONSTANT, BORDER_REPLICATE, BORDER_REFLECT,BORDER_REFLECT_101
CV_EXPORTS void GaussianBlur(const oclMat &src, oclMat &dst, Size ksize, double sigma1, double sigma2 = 0, int bordertype = BORDER_DEFAULT);
//! erodes the image (applies the local minimum operator)
// supports data type: CV_8UC1, CV_8UC4, CV_32FC1 and CV_32FC4
CV_EXPORTS void erode( const oclMat &src, oclMat &dst, const Mat &kernel, Point anchor = Point(-1, -1), int iterations = 1);
//! dilates the image (applies the local maximum operator)
// supports data type: CV_8UC1, CV_8UC4, CV_32FC1 and CV_32FC4
CV_EXPORTS void dilate( const oclMat &src, oclMat &dst, const Mat &kernel, Point anchor = Point(-1, -1), int iterations = 1);
//! applies an advanced morphological operation to the image
CV_EXPORTS void morphologyEx( const oclMat &src, oclMat &dst, int op, const Mat &kernel, Point anchor = Point(-1, -1), int iterations = 1);
////////////////////////////// Image processing //////////////////////////////
//! Does mean shift filtering on GPU.
CV_EXPORTS void meanShiftFiltering(const oclMat &src, oclMat &dst, int sp, int sr,
TermCriteria criteria = TermCriteria(TermCriteria::MAX_ITER + TermCriteria::EPS, 5, 1));
//! Does mean shift procedure on GPU.
CV_EXPORTS void meanShiftProc(const oclMat &src, oclMat &dstr, oclMat &dstsp, int sp, int sr,
TermCriteria criteria = TermCriteria(TermCriteria::MAX_ITER + TermCriteria::EPS, 5, 1));
//! Does mean shift segmentation with elimination of small regions.
CV_EXPORTS void meanShiftSegmentation(const oclMat &src, Mat &dst, int sp, int sr, int minsize,
TermCriteria criteria = TermCriteria(TermCriteria::MAX_ITER + TermCriteria::EPS, 5, 1));
//! applies fixed threshold to the image.
// supports CV_8UC1 and CV_32FC1 data type
// supports threshold type: THRESH_BINARY, THRESH_BINARY_INV, THRESH_TRUNC, THRESH_TOZERO, THRESH_TOZERO_INV
CV_EXPORTS double threshold(const oclMat &src, oclMat &dst, double thresh, double maxVal, int type = THRESH_TRUNC);
//! resizes the image
// Supports INTER_NEAREST, INTER_LINEAR
// supports CV_8UC1, CV_8UC4, CV_32FC1 and CV_32FC4 types
CV_EXPORTS void resize(const oclMat &src, oclMat &dst, Size dsize, double fx = 0, double fy = 0, int interpolation = INTER_LINEAR);
//! Applies a generic geometrical transformation to an image.
// Supports INTER_NEAREST, INTER_LINEAR.
// Map1 supports CV_16SC2, CV_32FC2 types.
// Src supports CV_8UC1, CV_8UC2, CV_8UC4.
CV_EXPORTS void remap(const oclMat& src, oclMat& dst, oclMat& map1, oclMat& map2, int interpolation, int bordertype, const Scalar& value = Scalar());
//! copies 2D array to a larger destination array and pads borders with user-specifiable constant
// supports CV_8UC1, CV_8UC4, CV_32SC1 types
CV_EXPORTS void copyMakeBorder(const oclMat &src, oclMat &dst, int top, int bottom, int left, int right, int boardtype, const Scalar &value = Scalar());
//! Smoothes image using median filter
// The source 1- or 4-channel image. When m is 3 or 5, the image depth should be CV_8U or CV_32F.
CV_EXPORTS void medianFilter(const oclMat &src, oclMat &dst, int m);
//! warps the image using affine transformation
// Supports INTER_NEAREST, INTER_LINEAR, INTER_CUBIC
// supports CV_8UC1, CV_8UC4, CV_32FC1 and CV_32FC4 types
CV_EXPORTS void warpAffine(const oclMat &src, oclMat &dst, const Mat &M, Size dsize, int flags = INTER_LINEAR);
//! warps the image using perspective transformation
// Supports INTER_NEAREST, INTER_LINEAR, INTER_CUBIC
// supports CV_8UC1, CV_8UC4, CV_32FC1 and CV_32FC4 types
CV_EXPORTS void warpPerspective(const oclMat &src, oclMat &dst, const Mat &M, Size dsize, int flags = INTER_LINEAR);
//! computes the integral image and integral for the squared image
// sum will have CV_32S type, sqsum - CV_32F type
// supports only CV_8UC1 source type
CV_EXPORTS void integral(const oclMat &src, oclMat &sum, oclMat &sqsum);
CV_EXPORTS void integral(const oclMat &src, oclMat &sum);
CV_EXPORTS void cornerHarris(const oclMat &src, oclMat &dst, int blockSize, int ksize, double k, int bordertype = cv::BORDER_DEFAULT);
CV_EXPORTS void cornerMinEigenVal(const oclMat &src, oclMat &dst, int blockSize, int ksize, int bordertype = cv::BORDER_DEFAULT);
//////////////////////////////// StereoBM_GPU ////////////////////////////////
class CV_EXPORTS StereoBM_GPU
{
public:
enum { BASIC_PRESET = 0, PREFILTER_XSOBEL = 1 };
enum { DEFAULT_NDISP = 64, DEFAULT_WINSZ = 19 };
//! the default constructor
StereoBM_GPU();
//! the full constructor taking the camera-specific preset, number of disparities and the SAD window size. ndisparities must be multiple of 8.
StereoBM_GPU(int preset, int ndisparities = DEFAULT_NDISP, int winSize = DEFAULT_WINSZ);
//! the stereo correspondence operator. Finds the disparity for the specified rectified stereo pair
//! Output disparity has CV_8U type.
void operator() ( const oclMat &left, const oclMat &right, oclMat &disparity);
//! Some heuristics that tries to estimate
// if the current GPU will be faster than the CPU in this algorithm.
// It queries current active device.
static bool checkIfGpuCallReasonable();
int preset;
int ndisp;
int winSize;
// If avergeTexThreshold == 0 => post processing is disabled
// If avergeTexThreshold != 0 then disparity is set to 0 in each point (x,y) where for the left image
// SumOfHorizontalGradientsInWindow(x, y, winSize) < (winSize * winSize) * avergeTexThreshold
// i.e. the input left image is low textured.
float avergeTexThreshold;
private:
oclMat minSSD, leBuf, riBuf;
};
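// Usage sketch (illustrative): disparity from an already-rectified pair
// (left/right are placeholder cv::Mat images).
//     StereoBM_GPU bm(StereoBM_GPU::PREFILTER_XSOBEL, 64, 19); // ndisp must be a multiple of 8
//     oclMat d_left(left), d_right(right), d_disp;
//     bm(d_left, d_right, d_disp);          // output disparity has CV_8U type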
////////////////////////// StereoBeliefPropagation ///////////////////////////
// "Efficient Belief Propagation for Early Vision"
// P.Felzenszwalb
class CV_EXPORTS StereoBeliefPropagation
{
public:
enum { DEFAULT_NDISP = 64 };
enum { DEFAULT_ITERS = 5 };
enum { DEFAULT_LEVELS = 5 };
static void estimateRecommendedParams(int width, int height, int &ndisp, int &iters, int &levels);
//! the default constructor
explicit StereoBeliefPropagation(int ndisp = DEFAULT_NDISP,
int iters = DEFAULT_ITERS,
int levels = DEFAULT_LEVELS,
int msg_type = CV_16S);
//! the full constructor taking the number of disparities, number of BP iterations on each level,
//! number of levels, truncation of data cost, data weight,
//! truncation of discontinuity cost and discontinuity single jump
//! DataTerm = data_weight * min(fabs(I2-I1), max_data_term)
//! DiscTerm = min(disc_single_jump * fabs(f1-f2), max_disc_term)
//! please see paper for more details
StereoBeliefPropagation(int ndisp, int iters, int levels,
float max_data_term, float data_weight,
float max_disc_term, float disc_single_jump,
int msg_type = CV_32F);
//! the stereo correspondence operator. Finds the disparity for the specified rectified stereo pair,
//! if disparity is empty output type will be CV_16S else output type will be disparity.type().
void operator()(const oclMat &left, const oclMat &right, oclMat &disparity);
//! version for user specified data term
void operator()(const oclMat &data, oclMat &disparity);
int ndisp;
int iters;
int levels;
float max_data_term;
float data_weight;
float max_disc_term;
float disc_single_jump;
int msg_type;
private:
oclMat u, d, l, r, u2, d2, l2, r2;
std::vector<oclMat> datas;
oclMat out;
};
/////////////////////////// StereoConstantSpaceBP ///////////////////////////
// "A Constant-Space Belief Propagation Algorithm for Stereo Matching"
// Qingxiong Yang, Liang Wang, Narendra Ahuja
// http://vision.ai.uiuc.edu/~qyang6/
class CV_EXPORTS StereoConstantSpaceBP
{
public:
enum { DEFAULT_NDISP = 128 };
enum { DEFAULT_ITERS = 8 };
enum { DEFAULT_LEVELS = 4 };
enum { DEFAULT_NR_PLANE = 4 };
static void estimateRecommendedParams(int width, int height, int &ndisp, int &iters, int &levels, int &nr_plane);
//! the default constructor
explicit StereoConstantSpaceBP(int ndisp = DEFAULT_NDISP,
int iters = DEFAULT_ITERS,
int levels = DEFAULT_LEVELS,
int nr_plane = DEFAULT_NR_PLANE,
int msg_type = CV_32F);
//! the full constructor taking the number of disparities, number of BP iterations on each level,
//! number of levels, number of active disparity on the first level, truncation of data cost, data weight,
//! truncation of discontinuity cost, discontinuity single jump and minimum disparity threshold
StereoConstantSpaceBP(int ndisp, int iters, int levels, int nr_plane,
float max_data_term, float data_weight, float max_disc_term, float disc_single_jump,
int min_disp_th = 0,
int msg_type = CV_32F);
//! the stereo correspondence operator. Finds the disparity for the specified rectified stereo pair,
//! if disparity is empty output type will be CV_16S else output type will be disparity.type().
void operator()(const oclMat &left, const oclMat &right, oclMat &disparity);
int ndisp;
int iters;
int levels;
int nr_plane;
float max_data_term;
float data_weight;
float max_disc_term;
float disc_single_jump;
int min_disp_th;
int msg_type;
bool use_local_init_data_cost;
private:
oclMat u[2], d[2], l[2], r[2];
oclMat disp_selected_pyr[2];
oclMat data_cost;
oclMat data_cost_selected;
oclMat temp;
oclMat out;
};
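//! Usage sketch (illustrative only), mirroring the BP example above but with the
//! constant-space variant's extra nr_plane (active disparities on the first level):
//!     int ndisp, iters, levels, nr_plane;
//!     StereoConstantSpaceBP::estimateRecommendedParams(width, height, ndisp, iters, levels, nr_plane);
//!     StereoConstantSpaceBP csbp(ndisp, iters, levels, nr_plane);
//!     csbp(left, right, disparity);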
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////CascadeClassifier//////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
class CV_EXPORTS_W OclCascadeClassifier : public cv::CascadeClassifier
{
public:
OclCascadeClassifier() {}
~OclCascadeClassifier() {}
CvSeq *oclHaarDetectObjects(oclMat &gimg, CvMemStorage *storage, double scaleFactor,
int minNeighbors, int flags, CvSize minSize = cvSize(0, 0), CvSize maxSize = cvSize(0, 0));
};
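//! Usage sketch (illustrative only): the cascade XML is loaded through the
//! inherited cv::CascadeClassifier::load(); `gray` is assumed to be a CV_8UC1 oclMat:
//!     OclCascadeClassifier cascade;
//!     cascade.load("haarcascade_frontalface_alt.xml");
//!     CvMemStorage *storage = cvCreateMemStorage(0);
//!     CvSeq *faces = cascade.oclHaarDetectObjects(gray, storage, 1.1, 3, 0, cvSize(30, 30));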
///////////////////////////////////////////////////////jhp_benchmark////////////////////////////////////////////////////
void benchmark_copy_vectorize(const oclMat &src, oclMat &dst);
void benchmark_copy_offset_stride(const oclMat &src, oclMat &dst);
void benchmark_ILP();
}
}
#include "opencv2/ocl/matrix_operations.hpp"
#endif /* __OPENCV_GPU_HPP__ */
2384
modules/ocl/src/arithm.cpp Normal file

File diff suppressed because it is too large

View File

@ -0,0 +1,102 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
using namespace cv;
using namespace cv::ocl;
using namespace std;
using std::cout;
using std::endl;
#if !defined (HAVE_OPENCL)
namespace cv
{
namespace ocl
{
//nothing
}//namespace ocl
}//namespace cv
#else /* !defined (HAVE_OPENCL) */
namespace cv
{
namespace ocl
{
class ProgramCache
{
protected:
ProgramCache();
friend class auto_ptr<ProgramCache>;
static auto_ptr<ProgramCache> programCache;
public:
~ProgramCache();
static ProgramCache *getProgramCache()
{
if( NULL == programCache.get())
programCache.reset(new ProgramCache());
return programCache.get();
}
//lookup the binary given the file name
cl_program progLookup(string srcsign);
//add program to the cache
void addProgram(string srcsign, cl_program program);
void releaseProgram();
map <string, cl_program> codeCache;
unsigned int cacheSize;
//The presumed watermark for the cache volume (256MB). Is it enough?
//We may need more delicate algorithms when necessary later.
//Right now, let's just leave it alone.
static const unsigned MAX_PROG_CACHE_SIZE = 1024;
};
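//Intended lookup/insert flow (sketch; srcSignature and buildFromSource are
//hypothetical stand-ins for the caller's signature and build steps):
//    string sign = srcSignature(source, context);
//    cl_program prog = ProgramCache::getProgramCache()->progLookup(sign);
//    if(!prog)
//    {
//        prog = buildFromSource(source);
//        ProgramCache::getProgramCache()->addProgram(sign, prog);
//    }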
}//namespace ocl
}//namespace cv
#endif
153
modules/ocl/src/color.cpp Normal file
View File

@ -0,0 +1,153 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Wang Weiyan, wangweiyanster@gmail.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
using namespace cv;
using namespace cv::ocl;
#if !defined (HAVE_OPENCL)
void cv::ocl::cvtColor(const oclMat &, oclMat &, int, int)
{
throw_nogpu();
}
void cv::ocl::cvtColor(const oclMat &, oclMat &, int, int, const Stream &)
{
throw_nogpu();
}
#else /* !defined (HAVE_OPENCL) */
#ifndef CV_DESCALE
#define CV_DESCALE(x, n) (((x) + (1 << ((n)-1))) >> (n))
#endif
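//CV_DESCALE shifts a fixed-point value down by n bits with round-to-nearest,
//e.g. CV_DESCALE(1000, 4) == (1000 + 8) >> 4 == 63.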
#ifndef FLT_EPSILON
#define FLT_EPSILON 1.192092896e-07F
#endif
namespace cv
{
namespace ocl
{
extern const char *cvt_color;
}
}
namespace
{
void RGB2Gray_caller(const oclMat &src, oclMat &dst, int bidx)
{
vector<pair<size_t , const void *> > args;
int channels = src.channels();
char build_options[50];
//printf("depth:%d,channels:%d,bidx:%d\n",src.depth(),src.channels(),bidx);
sprintf(build_options, "-D DEPTH_%d", src.depth());
args.push_back( make_pair( sizeof(cl_int) , (void *)&src.cols));
args.push_back( make_pair( sizeof(cl_int) , (void *)&src.rows));
args.push_back( make_pair( sizeof(cl_int) , (void *)&src.step));
args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.step));
args.push_back( make_pair( sizeof(cl_int) , (void *)&channels));
args.push_back( make_pair( sizeof(cl_int) , (void *)&bidx));
args.push_back( make_pair( sizeof(cl_mem) , (void *)&src.data));
args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst.data));
size_t gt[3] = {src.cols, src.rows, 1}, lt[3] = {16, 16, 1};
openCLExecuteKernel(src.clCxt, &cvt_color, "RGB2Gray", gt, lt, args, -1, -1, build_options);
}
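//Each args entry is a (byte size, address) pair that openCLExecuteKernel forwards
//verbatim to clSetKernelArg, so scalars are passed by address and device buffers as cl_mem.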
void cvtColor_caller(const oclMat &src, oclMat &dst, int code, int dcn)
{
Size sz = src.size();
int scn = src.channels(), depth = src.depth(), bidx;
CV_Assert(depth == CV_8U || depth == CV_16U);
switch (code)
{
/*
case CV_BGR2BGRA: case CV_RGB2BGRA: case CV_BGRA2BGR:
case CV_RGBA2BGR: case CV_RGB2BGR: case CV_BGRA2RGBA:
case CV_BGR2BGR565: case CV_BGR2BGR555: case CV_RGB2BGR565: case CV_RGB2BGR555:
case CV_BGRA2BGR565: case CV_BGRA2BGR555: case CV_RGBA2BGR565: case CV_RGBA2BGR555:
case CV_BGR5652BGR: case CV_BGR5552BGR: case CV_BGR5652RGB: case CV_BGR5552RGB:
case CV_BGR5652BGRA: case CV_BGR5552BGRA: case CV_BGR5652RGBA: case CV_BGR5552RGBA:
*/
case CV_BGR2GRAY:
case CV_BGRA2GRAY:
case CV_RGB2GRAY:
case CV_RGBA2GRAY:
{
CV_Assert(scn == 3 || scn == 4);
bidx = code == CV_BGR2GRAY || code == CV_BGRA2GRAY ? 0 : 2;
dst.create(sz, CV_MAKETYPE(depth, 1));
RGB2Gray_caller(src, dst, bidx);
break;
}
/*
case CV_BGR5652GRAY: case CV_BGR5552GRAY:
case CV_GRAY2BGR: case CV_GRAY2BGRA:
case CV_GRAY2BGR565: case CV_GRAY2BGR555:
case CV_BGR2YCrCb: case CV_RGB2YCrCb:
case CV_BGR2YUV: case CV_RGB2YUV:
case CV_YCrCb2BGR: case CV_YCrCb2RGB:
case CV_YUV2BGR: case CV_YUV2RGB:
case CV_BGR2XYZ: case CV_RGB2XYZ:
case CV_XYZ2BGR: case CV_XYZ2RGB:
case CV_BGR2HSV: case CV_RGB2HSV: case CV_BGR2HSV_FULL: case CV_RGB2HSV_FULL:
case CV_BGR2HLS: case CV_RGB2HLS: case CV_BGR2HLS_FULL: case CV_RGB2HLS_FULL:
case CV_HSV2BGR: case CV_HSV2RGB: case CV_HSV2BGR_FULL: case CV_HSV2RGB_FULL:
case CV_HLS2BGR: case CV_HLS2RGB: case CV_HLS2BGR_FULL: case CV_HLS2RGB_FULL:
*/
default:
CV_Error( CV_StsBadFlag, "Unknown/unsupported color conversion code" );
}
}
}
void cv::ocl::cvtColor(const oclMat &src, oclMat &dst, int code, int dcn)
{
cvtColor_caller(src, dst, code, dcn);
}
#endif /* !defined (HAVE_OPENCL) */
198
modules/ocl/src/error.cpp Normal file
View File

@ -0,0 +1,198 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
using namespace cv;
using namespace cv::ocl;
#if !defined (HAVE_OPENCL)
// do nothing
#else /* !defined (HAVE_OPENCL) */
namespace cv
{
namespace ocl
{
const char *getOpenCLErrorString( int err )
{
switch(err)
{
case CL_DEVICE_NOT_FOUND:
return "CL_DEVICE_NOT_FOUND";
case CL_DEVICE_NOT_AVAILABLE:
return "CL_DEVICE_NOT_AVAILABLE";
case CL_COMPILER_NOT_AVAILABLE:
return "CL_COMPILER_NOT_AVAILABLE";
case CL_MEM_OBJECT_ALLOCATION_FAILURE:
return "CL_MEM_OBJECT_ALLOCATION_FAILURE";
case CL_OUT_OF_RESOURCES:
return "CL_OUT_OF_RESOURCES";
case CL_OUT_OF_HOST_MEMORY:
return "CL_OUT_OF_HOST_MEMORY";
case CL_PROFILING_INFO_NOT_AVAILABLE:
return "CL_PROFILING_INFO_NOT_AVAILABLE";
case CL_MEM_COPY_OVERLAP:
return "CL_MEM_COPY_OVERLAP";
case CL_IMAGE_FORMAT_MISMATCH:
return "CL_IMAGE_FORMAT_MISMATCH";
case CL_IMAGE_FORMAT_NOT_SUPPORTED:
return "CL_IMAGE_FORMAT_NOT_SUPPORTED";
case CL_BUILD_PROGRAM_FAILURE:
return "CL_BUILD_PROGRAM_FAILURE";
case CL_MAP_FAILURE:
return "CL_MAP_FAILURE";
case CL_MISALIGNED_SUB_BUFFER_OFFSET:
return "CL_MISALIGNED_SUB_BUFFER_OFFSET";
case CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST:
return "CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST";
case CL_INVALID_VALUE:
return "CL_INVALID_VALUE";
case CL_INVALID_DEVICE_TYPE:
return "CL_INVALID_DEVICE_TYPE";
case CL_INVALID_PLATFORM:
return "CL_INVALID_PLATFORM";
case CL_INVALID_DEVICE:
return "CL_INVALID_DEVICE";
case CL_INVALID_CONTEXT:
return "CL_INVALID_CONTEXT";
case CL_INVALID_QUEUE_PROPERTIES:
return "CL_INVALID_QUEUE_PROPERTIES";
case CL_INVALID_COMMAND_QUEUE:
return "CL_INVALID_COMMAND_QUEUE";
case CL_INVALID_HOST_PTR:
return "CL_INVALID_HOST_PTR";
case CL_INVALID_MEM_OBJECT:
return "CL_INVALID_MEM_OBJECT";
case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR:
return "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR";
case CL_INVALID_IMAGE_SIZE:
return "CL_INVALID_IMAGE_SIZE";
case CL_INVALID_SAMPLER:
return "CL_INVALID_SAMPLER";
case CL_INVALID_BINARY:
return "CL_INVALID_BINARY";
case CL_INVALID_BUILD_OPTIONS:
return "CL_INVALID_BUILD_OPTIONS";
case CL_INVALID_PROGRAM:
return "CL_INVALID_PROGRAM";
case CL_INVALID_PROGRAM_EXECUTABLE:
return "CL_INVALID_PROGRAM_EXECUTABLE";
case CL_INVALID_KERNEL_NAME:
return "CL_INVALID_KERNEL_NAME";
case CL_INVALID_KERNEL_DEFINITION:
return "CL_INVALID_KERNEL_DEFINITION";
case CL_INVALID_KERNEL:
return "CL_INVALID_KERNEL";
case CL_INVALID_ARG_INDEX:
return "CL_INVALID_ARG_INDEX";
case CL_INVALID_ARG_VALUE:
return "CL_INVALID_ARG_VALUE";
case CL_INVALID_ARG_SIZE:
return "CL_INVALID_ARG_SIZE";
case CL_INVALID_KERNEL_ARGS:
return "CL_INVALID_KERNEL_ARGS";
case CL_INVALID_WORK_DIMENSION:
return "CL_INVALID_WORK_DIMENSION";
case CL_INVALID_WORK_GROUP_SIZE:
return "CL_INVALID_WORK_GROUP_SIZE";
case CL_INVALID_WORK_ITEM_SIZE:
return "CL_INVALID_WORK_ITEM_SIZE";
case CL_INVALID_GLOBAL_OFFSET:
return "CL_INVALID_GLOBAL_OFFSET";
case CL_INVALID_EVENT_WAIT_LIST:
return "CL_INVALID_EVENT_WAIT_LIST";
case CL_INVALID_EVENT:
return "CL_INVALID_EVENT";
case CL_INVALID_OPERATION:
return "CL_INVALID_OPERATION";
case CL_INVALID_GL_OBJECT:
return "CL_INVALID_GL_OBJECT";
case CL_INVALID_BUFFER_SIZE:
return "CL_INVALID_BUFFER_SIZE";
case CL_INVALID_MIP_LEVEL:
return "CL_INVALID_MIP_LEVEL";
case CL_INVALID_GLOBAL_WORK_SIZE:
return "CL_INVALID_GLOBAL_WORK_SIZE";
//case CL_INVALID_PROPERTY:
// return "CL_INVALID_PROPERTY";
//case CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR:
// return "CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR";
//case CL_PLATFORM_NOT_FOUND_KHR:
// return "CL_PLATFORM_NOT_FOUND_KHR";
// //case CL_INVALID_PROPERTY_EXT:
// // return "CL_INVALID_PROPERTY_EXT";
//case CL_DEVICE_PARTITION_FAILED_EXT:
// return "CL_DEVICE_PARTITION_FAILED_EXT";
//case CL_INVALID_PARTITION_COUNT_EXT:
// return "CL_INVALID_PARTITION_COUNT_EXT";
//default:
// return "unknown error code";
}
static char buf[256];
sprintf(buf, "%d", err);
return buf;
}
void error(const char *error_string, const char *file, const int line, const char *func)
{
int code = CV_GpuApiCallError;
if (std::uncaught_exception())
{
const char *errorStr = cvErrorStr(code);
const char *function = func ? func : "unknown function";
std::cerr << "OpenCV Error: " << errorStr << "(" << error_string << ") in " << function << ", file " << file << ", line " << line;
std::cerr.flush();
}
else
cv::error( cv::Exception(code, error_string, func, file, line) );
}
}
}
#endif
File diff suppressed because it is too large

2715
modules/ocl/src/haar.cpp Normal file

File diff suppressed because it is too large

1353
modules/ocl/src/imgproc.cpp Normal file

File diff suppressed because it is too large

View File

@ -0,0 +1,885 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Guoping Long, longguoping@gmail.com
// Niko Li, newlife20080214@gmail.com
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
#include "threadsafe.h"
#include <iomanip>
#include "binarycaching.hpp"
using namespace cv;
using namespace cv::ocl;
using namespace std;
using std::cout;
using std::endl;
//#define PRINT_KERNEL_RUN_TIME
#define RUN_TIMES 100
//#define AMD_DOUBLE_DIFFER
#if !defined (HAVE_OPENCL)
namespace cv
{
namespace ocl
{
cl_device_id getDevice()
{
throw_nogpu();
return 0;
}
void getComputeCapability(cl_device_id, int &major, int &minor)
{
throw_nogpu();
}
void openCLMallocPitch(Context * /*clCxt*/, void ** /*dev_ptr*/, size_t * /*pitch*/,
size_t /*widthInBytes*/, size_t /*height*/)
{
throw_nogpu();
}
void openCLMemcpy2D(Context * /*clCxt*/, void * /*dst*/, size_t /*dpitch*/,
const void * /*src*/, size_t /*spitch*/,
size_t /*width*/, size_t /*height*/, enum openCLMemcpyKind /*kind*/)
{
throw_nogpu();
}
void openCLCopyBuffer2D(Context * /*clCxt*/, void * /*dst*/, size_t /*dpitch*/,
const void * /*src*/, size_t /*spitch*/,
size_t /*width*/, size_t /*height*/, enum openCLMemcpyKind /*kind*/)
{
throw_nogpu();
}
cl_mem openCLCreateBuffer(Context *,size_t, size_t)
{
throw_nogpu();
}
void openCLReadBuffer(Context *, cl_mem, void*, size_t)
{
throw_nogpu();
}
void openCLFree(void * /*devPtr*/)
{
throw_nogpu();
}
cl_kernel openCLGetKernelFromSource(const Context * /*clCxt*/,
const char ** /*fileName*/, string /*kernelName*/)
{
throw_nogpu();
}
void openCLVerifyKernel(const Context * /*clCxt*/, cl_kernel /*kernel*/, size_t * /*blockSize*/,
size_t * /*globalThreads*/, size_t * /*localThreads*/)
{
throw_nogpu();
}
cl_mem load_constant(cl_context context, cl_command_queue command_queue, const void *value,
const size_t size)
{
throw_nogpu();
}
}//namespace ocl
}//namespace cv
#else /* !defined (HAVE_OPENCL) */
namespace cv
{
namespace ocl
{
/*
* The binary caching system to eliminate redundant program source compilation.
* Strictly, this is not a cache because we do not implement evictions right now.
* We shall add such features to trade-off memory consumption and performance when necessary.
*/
auto_ptr<ProgramCache> ProgramCache::programCache;
ProgramCache *programCache = NULL;
ProgramCache::ProgramCache()
{
codeCache.clear();
cacheSize = 0;
}
ProgramCache::~ProgramCache()
{
releaseProgram();
}
cl_program ProgramCache::progLookup(string srcsign)
{
map<string, cl_program>::iterator iter;
iter = codeCache.find(srcsign);
if(iter != codeCache.end())
return iter->second;
else
return NULL;
}
void ProgramCache::addProgram(string srcsign , cl_program program)
{
if(!progLookup(srcsign))
{
codeCache.insert(map<string, cl_program>::value_type(srcsign, program));
}
}
void ProgramCache::releaseProgram()
{
map<string, cl_program>::iterator iter;
for(iter = codeCache.begin(); iter != codeCache.end(); iter++)
{
openCLSafeCall(clReleaseProgram(iter->second));
}
codeCache.clear();
cacheSize = 0;
}
////////////////////////Common OpenCL specific calls///////////////
//Info::Info()
//{
// oclplatform = 0;
// oclcontext = 0;
// devnum = 0;
//}
//Info::~Info()
//{
// release();
//}
//void Info::release()
//{
// if(oclplatform)
// {
// oclplatform = 0;
// }
// if(oclcontext)
// {
// openCLSafeCall(clReleaseContext(oclcontext));
// }
// devices.empty();
// devName.empty();
//}
struct Info::Impl
{
cl_platform_id oclplatform;
std::vector<cl_device_id> devices;
std::vector<std::string> devName;
cl_context oclcontext;
cl_command_queue clCmdQueue;
int devnum;
cl_uint maxDimensions;
size_t maxWorkGroupSize;
size_t *maxWorkItemSizes;
cl_uint maxComputeUnits;
char extra_options[512];
int double_support;
Impl()
{
memset(extra_options,0,512);
}
};
inline int divUp(int total, int grain)
{
return (total + grain - 1) / grain;
}
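//e.g. divUp(10, 4) == 3: the number of 4-wide groups needed to cover 10 items.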
int getDevice(std::vector<Info> &oclinfo, int devicetype)
{
cl_device_type _devicetype;
switch(devicetype)
{
case CVCL_DEVICE_TYPE_DEFAULT:
_devicetype = CL_DEVICE_TYPE_DEFAULT;
break;
case CVCL_DEVICE_TYPE_CPU:
_devicetype = CL_DEVICE_TYPE_CPU;
break;
case CVCL_DEVICE_TYPE_GPU:
_devicetype = CL_DEVICE_TYPE_GPU;
break;
case CVCL_DEVICE_TYPE_ACCELERATOR:
_devicetype = CL_DEVICE_TYPE_ACCELERATOR;
break;
case CVCL_DEVICE_TYPE_ALL:
_devicetype = CL_DEVICE_TYPE_ALL;
break;
default:
CV_Error(-217,"Unknown device type");
}
int devicenums = 0;
// Platform info
cl_int status = 0;
cl_uint numPlatforms;
Info ocltmpinfo;
openCLSafeCall(clGetPlatformIDs(0, NULL, &numPlatforms));
CV_Assert(numPlatforms > 0);
cl_platform_id *platforms = new cl_platform_id[numPlatforms];
openCLSafeCall(clGetPlatformIDs(numPlatforms, platforms, NULL));
char deviceName[256];
for (unsigned i = 0; i < numPlatforms; ++i)
{
cl_uint numsdev;
status = clGetDeviceIDs(platforms[i], devicetype, 0, NULL, &numsdev);
if(status != CL_DEVICE_NOT_FOUND)
{
openCLVerifyCall(status);
}
if(numsdev > 0)
{
devicenums += numsdev;
cl_device_id *devices = new cl_device_id[numsdev];
openCLSafeCall(clGetDeviceIDs(platforms[i], devicetype, numsdev, devices, NULL));
ocltmpinfo.impl->oclplatform = platforms[i];
for(unsigned j = 0; j < numsdev; j++)
{
ocltmpinfo.impl->devices.push_back(devices[j]);
openCLSafeCall(clGetDeviceInfo(devices[j], CL_DEVICE_NAME, 256, deviceName, NULL));
ocltmpinfo.impl->devName.push_back(std::string(deviceName));
}
delete[] devices;
oclinfo.push_back(ocltmpinfo);
ocltmpinfo.release();
}
}
delete[] platforms;
if(devicenums > 0)
{
setDevice(oclinfo[0]);
}
return devicenums;
}
void setDevice(Info &oclinfo, int devnum)
{
CV_Assert(devnum >= 0);
cl_int status = 0;
cl_context_properties cps[3] =
{
CL_CONTEXT_PLATFORM, (cl_context_properties)(oclinfo.impl->oclplatform), 0
};
oclinfo.impl->devnum = devnum;
oclinfo.impl->oclcontext = clCreateContext(cps, 1, &oclinfo.impl->devices[devnum], NULL, NULL, &status);
openCLVerifyCall(status);
//create the command queue using the first device of the list
oclinfo.impl->clCmdQueue = clCreateCommandQueue(oclinfo.impl->oclcontext, oclinfo.impl->devices[devnum],
CL_QUEUE_PROFILING_ENABLE, &status);
openCLVerifyCall(status);
//get device information
openCLSafeCall(clGetDeviceInfo(oclinfo.impl->devices[devnum], CL_DEVICE_MAX_WORK_GROUP_SIZE,
sizeof(size_t), (void *)&oclinfo.impl->maxWorkGroupSize, NULL));
openCLSafeCall(clGetDeviceInfo(oclinfo.impl->devices[devnum], CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS,
sizeof(cl_uint), (void *)&oclinfo.impl->maxDimensions, NULL));
oclinfo.impl->maxWorkItemSizes = new size_t[oclinfo.impl->maxDimensions];
openCLSafeCall(clGetDeviceInfo(oclinfo.impl->devices[devnum], CL_DEVICE_MAX_WORK_ITEM_SIZES,
sizeof(size_t)*oclinfo.impl->maxDimensions, (void *)oclinfo.impl->maxWorkItemSizes, NULL));
openCLSafeCall(clGetDeviceInfo(oclinfo.impl->devices[devnum], CL_DEVICE_MAX_COMPUTE_UNITS,
sizeof(cl_uint), (void *)&oclinfo.impl->maxComputeUnits, NULL));
//initialize extra options for compilation. Currently only fp64 is included.
//Assume 4KB is enough to store all possible extensions.
const int EXT_LEN = 4096 + 1 ;
char extends_set[EXT_LEN];
size_t extends_size;
openCLSafeCall(clGetDeviceInfo(oclinfo.impl->devices[devnum], CL_DEVICE_EXTENSIONS,
EXT_LEN, (void *)extends_set, &extends_size));
CV_Assert(extends_size < EXT_LEN);
extends_set[EXT_LEN-1] = 0;
//oclinfo.extra_options = NULL;
int fp64_khr = string(extends_set).find("cl_khr_fp64");
if(fp64_khr >= 0 && fp64_khr < EXT_LEN)
{
sprintf(oclinfo.impl->extra_options , "-D DOUBLE_SUPPORT");
oclinfo.impl -> double_support = 1;
}
Context::setContext(oclinfo);
}
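//Typical initialization flow (sketch), using only the two functions above:
//    std::vector<Info> infos;
//    if(getDevice(infos, CVCL_DEVICE_TYPE_GPU) > 0)
//        setDevice(infos[0], 0); //getDevice() already falls back to oclinfo[0] by default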
void openCLReadBuffer(Context *clCxt, cl_mem dst_buffer, void *host_buffer, size_t size)
{
cl_int status;
status = clEnqueueReadBuffer(clCxt->impl->clCmdQueue, dst_buffer, CL_TRUE, 0,
size, host_buffer, 0, NULL, NULL);
openCLVerifyCall(status);
}
cl_mem openCLCreateBuffer(Context *clCxt, size_t flag , size_t size)
{
cl_int status;
cl_mem buffer = clCreateBuffer(clCxt->impl->clContext,(cl_mem_flags)flag, size, NULL, &status);
openCLVerifyCall(status);
return buffer;
}
void openCLMallocPitch(Context *clCxt, void **dev_ptr, size_t *pitch,
size_t widthInBytes, size_t height)
{
cl_int status;
*dev_ptr = clCreateBuffer(clCxt->impl->clContext, CL_MEM_READ_WRITE,
widthInBytes * height, 0, &status);
openCLVerifyCall(status);
*pitch = widthInBytes;
}
void openCLMemcpy2D(Context *clCxt, void *dst, size_t dpitch,
const void *src, size_t spitch,
size_t width, size_t height, enum openCLMemcpyKind kind)
{
size_t buffer_origin[3] = {0, 0, 0};
size_t host_origin[3] = {0, 0, 0};
size_t region[3] = {width, height, 1};
if(kind == clMemcpyHostToDevice)
{
openCLSafeCall(clEnqueueWriteBufferRect(clCxt->impl->clCmdQueue, (cl_mem)dst, CL_TRUE,
buffer_origin, host_origin, region, dpitch, 0, spitch, 0, src, 0, 0, 0));
}
else if(kind == clMemcpyDeviceToHost)
{
openCLSafeCall(clEnqueueReadBufferRect(clCxt->impl->clCmdQueue, (cl_mem)src, CL_TRUE,
buffer_origin, host_origin, region, spitch, 0, dpitch, 0, dst, 0, 0, 0));
}
}
void openCLCopyBuffer2D(Context *clCxt, void *dst, size_t dpitch, int dst_offset,
const void *src, size_t spitch,
size_t width, size_t height, int src_offset, enum openCLMemcpyKind kind)
{
size_t src_origin[3] = {src_offset % spitch, src_offset / spitch, 0};
size_t dst_origin[3] = {dst_offset % dpitch, dst_offset / dpitch, 0};
size_t region[3] = {width, height, 1};
openCLSafeCall(clEnqueueCopyBufferRect(clCxt->impl->clCmdQueue, (cl_mem)src, (cl_mem)dst, src_origin, dst_origin,
region, spitch, 0, dpitch, 0, 0, 0, 0));
}
void openCLFree(void *devPtr)
{
openCLSafeCall(clReleaseMemObject((cl_mem)devPtr));
}
cl_kernel openCLGetKernelFromSource(const Context *clCxt, const char **source, string kernelName)
{
return openCLGetKernelFromSource(clCxt, source, kernelName, NULL);
}
void setBinpath(const char *path)
{
Context *clcxt = Context::getContext();
clcxt->impl->Binpath = path;
}
int savetofile(const Context *clcxt, cl_program &program, const char *fileName)
{
cl_int status;
size_t numDevices = 1;
cl_device_id *devices = clcxt->impl->devices;
//figure out the sizes of each of the binaries.
size_t *binarySizes = (size_t *)malloc( sizeof(size_t) * numDevices );
openCLSafeCall(clGetProgramInfo(program,
CL_PROGRAM_BINARY_SIZES,
sizeof(size_t) * numDevices,
binarySizes, NULL));
size_t i = 0;
//copy over all of the generated binaries.
char **binaries = (char **)malloc( sizeof(char *) * numDevices );
if(binaries == NULL)
{
CV_Error(-217,"Failed to allocate host memory.(binaries)\r\n");
}
for(i = 0; i < numDevices; i++)
{
if(binarySizes[i] != 0)
{
binaries[i] = (char *)malloc( sizeof(char) * binarySizes[i]);
if(binaries[i] == NULL)
{
CV_Error(-217,"Failed to allocate host memory.(binaries[i])\r\n");
}
}
else
{
binaries[i] = NULL;
}
}
openCLSafeCall(clGetProgramInfo(program,
CL_PROGRAM_BINARIES,
sizeof(char *) * numDevices,
binaries,
NULL));
//dump out each binary into its own separate file.
for(i = 0; i < numDevices; i++)
{
if(binarySizes[i] != 0)
{
char deviceName[1024];
openCLSafeCall(clGetDeviceInfo(devices[i],
CL_DEVICE_NAME,
sizeof(deviceName),
deviceName,
NULL));
printf( "%s binary kernel: %s\n", deviceName, fileName);
FILE *fp = fopen(fileName, "wb+");
if(fp == NULL)
{
char temp[256];
sprintf(temp, "Failed to open kernel binary file for writing: %s\r\n", fileName);
CV_Error(-217, temp);
}
else
{
fwrite(binaries[i], binarySizes[i], 1, fp);
free(binaries[i]);
fclose(fp);
}
}
else
{
printf("Skipping %s since there is no binary data to write!\n",
fileName);
}
}
free(binarySizes);
free(binaries);
return 1;
}
cl_kernel openCLGetKernelFromSource(const Context *clCxt, const char **source, string kernelName,
const char *build_options)
{
cl_kernel kernel;
cl_program program ;
cl_int status = 0;
stringstream src_sign;
string srcsign;
string filename;
CV_Assert(programCache != NULL);
if(NULL != build_options)
{
src_sign << (int64)source << clCxt->impl->clContext << "_" << build_options;
}
else
{
src_sign << (int64)source << clCxt->impl->clContext;
}
srcsign = src_sign.str();
program = NULL;
program = programCache->progLookup(srcsign);
if(!program)
{
//config build programs
char all_build_options[1024];
memset(all_build_options, 0, 1024);
char zeromem[512]={0};
if(0!=memcmp(clCxt -> impl->extra_options, zeromem,512))
strcat(all_build_options, clCxt -> impl->extra_options);
strcat(all_build_options, " ");
if(build_options != NULL)
strcat(all_build_options, build_options);
if(all_build_options[0] != '\0')
{
filename = clCxt->impl->Binpath + "\\" + kernelName + "_" + clCxt->impl->devName + all_build_options + ".clb";
}
else
{
filename = clCxt->impl->Binpath + "\\" + kernelName + "_" + clCxt->impl->devName + ".clb";
}
FILE *fp;
fp = fopen(filename.c_str(), "rb");
if(fp == NULL || clCxt->impl->Binpath.size() == 0) //we should generate a binary file for the first time.
{
program = clCreateProgramWithSource(
clCxt->impl->clContext, 1, source, NULL, &status);
openCLVerifyCall(status);
status = clBuildProgram(program, 1, &(clCxt->impl->devices[0]), all_build_options, NULL, NULL);
if(status == CL_SUCCESS && clCxt->impl->Binpath.size())
savetofile(clCxt, program, filename.c_str());
}
else
{
fseek(fp, 0, SEEK_END);
size_t binarySize = ftell(fp);
fseek(fp, 0, SEEK_SET);
char *binary = new char[binarySize];
fread(binary, binarySize, 1, fp);
fclose(fp);
cl_int status = 0;
program = clCreateProgramWithBinary(clCxt->impl->clContext,
1,
&(clCxt->impl->devices[0]),
(const size_t *)&binarySize,
(const unsigned char **)&binary,
NULL,
&status);
openCLVerifyCall(status);
status = clBuildProgram(program, 1, &(clCxt->impl->devices[0]), all_build_options, NULL, NULL);
}
if(status != CL_SUCCESS)
{
if(status == CL_BUILD_PROGRAM_FAILURE)
{
cl_int logStatus;
char *buildLog = NULL;
size_t buildLogSize = 0;
logStatus = clGetProgramBuildInfo(program,
clCxt->impl->devices[0], CL_PROGRAM_BUILD_LOG, buildLogSize,
buildLog, &buildLogSize);
if(logStatus != CL_SUCCESS)
cout << "Failed to build the program and get the build info." << endl;
buildLog = new char[buildLogSize];
CV_DbgAssert(!!buildLog);
memset(buildLog, 0, buildLogSize);
openCLSafeCall(clGetProgramBuildInfo(program, clCxt->impl->devices[0],
CL_PROGRAM_BUILD_LOG, buildLogSize, buildLog, NULL));
cout << "\n\t\t\tBUILD LOG\n";
cout << buildLog << endl;
delete[] buildLog;
}
openCLVerifyCall(status);
}
//Cache the binary for future use if build_options is null
if( (programCache->cacheSize += 1) < programCache->MAX_PROG_CACHE_SIZE)
programCache->addProgram(srcsign, program);
else
cout << "Warning: code cache is full.\n";
}
kernel = clCreateKernel(program, kernelName.c_str(), &status);
openCLVerifyCall(status);
return kernel;
}
void openCLVerifyKernel(const Context *clCxt, cl_kernel kernel, size_t *blockSize,
size_t *globalThreads, size_t *localThreads)
{
size_t kernelWorkGroupSize;
openCLSafeCall(clGetKernelWorkGroupInfo(kernel, clCxt->impl->devices[0],
CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &kernelWorkGroupSize, 0));
CV_DbgAssert( (localThreads[0] <= clCxt->impl->maxWorkItemSizes[0]) &&
(localThreads[1] <= clCxt->impl->maxWorkItemSizes[1]) &&
(localThreads[2] <= clCxt->impl->maxWorkItemSizes[2]) &&
((localThreads[0] * localThreads[1] * localThreads[2]) <= kernelWorkGroupSize) &&
(localThreads[0] * localThreads[1] * localThreads[2]) <= clCxt->impl->maxWorkGroupSize);
}
#ifdef PRINT_KERNEL_RUN_TIME
static double total_execute_time = 0;
static double total_kernel_time = 0;
#endif
void openCLExecuteKernel_(Context *clCxt , const char **source, string kernelName, size_t globalThreads[3],
size_t localThreads[3], vector< pair<size_t, const void *> > &args, int channels,
int depth, char *build_options)
{
//construct kernel name
//The rule is functionName_Cn_Dn, where C stands for channels, D for the data type depth, and n is an integer
//for example split_C2_D2 is the split kernel with channels = 2 and data type depth = 2 (i.e. the data type is char)
stringstream idxStr;
if(channels != -1)
idxStr << "_C" << channels;
if(depth != -1)
idxStr << "_D" << depth;
kernelName += idxStr.str();
cl_kernel kernel;
kernel = openCLGetKernelFromSource(clCxt, source, kernelName, build_options);
globalThreads[0] = divUp(globalThreads[0], localThreads[0]) * localThreads[0];
globalThreads[1] = divUp(globalThreads[1], localThreads[1]) * localThreads[1];
globalThreads[2] = divUp(globalThreads[2], localThreads[2]) * localThreads[2];
size_t blockSize = localThreads[0] * localThreads[1] * localThreads[2];
cv::ocl::openCLVerifyKernel(clCxt, kernel, &blockSize, globalThreads, localThreads);
for(size_t i = 0; i < args.size(); i++)
openCLSafeCall(clSetKernelArg(kernel, (cl_uint)i, args[i].first, args[i].second));
#ifndef PRINT_KERNEL_RUN_TIME
openCLSafeCall(clEnqueueNDRangeKernel(clCxt->impl->clCmdQueue, kernel, 3, NULL, globalThreads,
localThreads, 0, NULL, NULL));
#else
cl_event event = NULL;
openCLSafeCall(clEnqueueNDRangeKernel(clCxt->impl->clCmdQueue, kernel, 3, NULL, globalThreads,
localThreads, 0, NULL, &event));
cl_ulong start_time, end_time, queue_time;
double execute_time = 0;
double total_time = 0;
openCLSafeCall(clWaitForEvents(1, &event));
openCLSafeCall(clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START,
sizeof(cl_ulong), &start_time, 0));
openCLSafeCall(clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END,
sizeof(cl_ulong), &end_time, 0));
openCLSafeCall(clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_QUEUED,
sizeof(cl_ulong), &queue_time, 0));
execute_time = (double)(end_time - start_time) / (1000 * 1000);
total_time = (double)(end_time - queue_time) / (1000 * 1000);
// cout << setiosflags(ios::left) << setw(15) << execute_time;
// cout << setiosflags(ios::left) << setw(15) << total_time - execute_time;
// cout << setiosflags(ios::left) << setw(15) << total_time << endl;
total_execute_time += execute_time;
total_kernel_time += total_time;
clReleaseEvent(event);
#endif
clFinish(clCxt->impl->clCmdQueue);
openCLSafeCall(clReleaseKernel(kernel));
}
void openCLExecuteKernel(Context *clCxt , const char **source, string kernelName,
size_t globalThreads[3], size_t localThreads[3],
vector< pair<size_t, const void *> > &args, int channels, int depth)
{
openCLExecuteKernel(clCxt, source, kernelName, globalThreads, localThreads, args,
channels, depth, NULL);
}
void openCLExecuteKernel(Context *clCxt , const char **source, string kernelName,
size_t globalThreads[3], size_t localThreads[3],
vector< pair<size_t, const void *> > &args, int channels, int depth, char *build_options)
{
#ifndef PRINT_KERNEL_RUN_TIME
openCLExecuteKernel_(clCxt, source, kernelName, globalThreads, localThreads, args, channels, depth,
build_options);
#else
string data_type[] = { "uchar", "char", "ushort", "short", "int", "float", "double"};
cout << endl;
cout << "Function Name: " << kernelName;
if(depth >= 0)
cout << " |data type: " << data_type[depth];
cout << " |channels: " << channels;
cout << " |Time Unit: " << "ms" << endl;
total_execute_time = 0;
total_kernel_time = 0;
cout << "-------------------------------------" << endl;
cout << setiosflags(ios::left) << setw(15) << "execute time";
cout << setiosflags(ios::left) << setw(15) << "launch time";
cout << setiosflags(ios::left) << setw(15) << "kernel time" << endl;
int i = 0;
for(i = 0; i < RUN_TIMES; i++)
openCLExecuteKernel_(clCxt, source, kernelName, globalThreads, localThreads, args, channels, depth,
build_options);
cout << "average kernel execute time: " << total_execute_time / RUN_TIMES << endl; // "ms" << endl;
cout << "average kernel total time: " << total_kernel_time / RUN_TIMES << endl; // "ms" << endl;
#endif
}
cl_mem load_constant(cl_context context, cl_command_queue command_queue, const void *value,
const size_t size)
{
int status;
cl_mem con_struct;
con_struct = clCreateBuffer(context, CL_MEM_READ_ONLY, size, NULL, &status);
openCLSafeCall(status);
openCLSafeCall(clEnqueueWriteBuffer(command_queue, con_struct, 1, 0, size,
value, 0, 0, 0));
return con_struct;
}
/////////////////////////////OpenCL initialization/////////////////
auto_ptr<Context> Context::clCxt;
int Context::val = 0;
CriticalSection cs;
Context *Context::getContext()
{
if(val == 0)
{
AutoLock al(&cs);
if( NULL == clCxt.get())
clCxt.reset(new Context);
val = 1;
return clCxt.get();
}
else
{
return clCxt.get();
}
}
void Context::setContext(Info &oclinfo)
{
Context *clcxt = getContext();
clcxt->impl->clContext = oclinfo.impl->oclcontext;
clcxt->impl->clCmdQueue = oclinfo.impl->clCmdQueue;
clcxt->impl->devices = &oclinfo.impl->devices[oclinfo.impl->devnum];
clcxt->impl->devName = oclinfo.impl->devName[oclinfo.impl->devnum];
clcxt->impl->maxDimensions = oclinfo.impl->maxDimensions;
clcxt->impl->maxWorkGroupSize = oclinfo.impl->maxWorkGroupSize;
clcxt->impl->maxWorkItemSizes = oclinfo.impl->maxWorkItemSizes;
clcxt->impl->maxComputeUnits = oclinfo.impl->maxComputeUnits;
clcxt->impl->double_support = oclinfo.impl->double_support;
//extra options to recognize compiler options
clcxt->impl->extra_options = oclinfo.impl->extra_options;
}
Context::Context()
{
impl = new Impl;
//Information of the OpenCL context
impl->clContext = NULL;
impl->clCmdQueue = NULL;
impl->devices = NULL;
impl->maxDimensions = 0;
impl->maxWorkGroupSize = 0;
impl->maxWorkItemSizes = NULL;
impl->maxComputeUnits = 0;
impl->double_support = 0;
//extra options to recognize vendor specific fp64 extensions
impl->extra_options = NULL;
programCache = ProgramCache::getProgramCache();
}
Context::~Context()
{
delete impl;
programCache->releaseProgram();
}
Info::Info()
{
impl = new Impl;
impl->oclplatform = 0;
impl->oclcontext = 0;
impl->clCmdQueue = 0;
impl->devnum = 0;
impl->maxDimensions = 0;
impl->maxWorkGroupSize = 0;
impl->maxWorkItemSizes = 0;
impl->maxComputeUnits = 0;
impl->double_support = 0;
//extra_options = 0;
}
void Info::release()
{
if(impl->oclplatform)
{
impl->oclplatform = 0;
}
if(impl->clCmdQueue)
{
openCLSafeCall(clReleaseCommandQueue(impl->clCmdQueue));
}
ProgramCache::getProgramCache()->releaseProgram();
if(impl->oclcontext)
{
openCLSafeCall(clReleaseContext(impl->oclcontext));
}
if(impl->maxWorkItemSizes)
{
delete[] impl->maxWorkItemSizes;
impl->maxWorkItemSizes = 0;
}
//if(extra_options)
//{
// delete[] extra_options;
// extra_options = 0;
//}
impl->devices.clear();
impl->devName.clear();
}
Info::~Info()
{
release();
delete impl;
}
Info &Info::operator = (const Info &m)
{
impl->oclplatform = m.impl->oclplatform;
impl->oclcontext = m.impl->oclcontext;
impl->clCmdQueue = m.impl->clCmdQueue;
impl->devnum = m.impl->devnum;
impl->maxDimensions = m.impl->maxDimensions;
impl->maxWorkGroupSize = m.impl->maxWorkGroupSize;
impl->maxWorkItemSizes = m.impl->maxWorkItemSizes;
impl->maxComputeUnits = m.impl->maxComputeUnits;
impl->double_support = m.impl->double_support;
memcpy(impl->extra_options, m.impl->extra_options, 512);
for(size_t i = 0; i < m.impl->devices.size(); i++)
{
impl->devices.push_back(m.impl->devices[i]);
impl->devName.push_back(m.impl->devName[i]);
}
return *this;
}
Info::Info(const Info &m)
{
impl = new Impl;
*this = m;
}
}//namespace ocl
}//namespace cv
#endif
View File

@ -0,0 +1,158 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Shengen Yan,yanshengen@gmail.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
/**************************************PUBLICFUNC*************************************/
#if defined (DOUBLE_SUPPORT)
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#endif
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics:enable
#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics:enable
#define CV_PI 3.1415926535897932384626433832795
char round_char(double v){
return convert_char_sat(v+(v>=0 ? 0.5 : -0.5));
}
unsigned char round_uchar(double v){
return convert_uchar_sat(v+(v>=0 ? 0.5 : -0.5));
}
short round_short(double v){
return convert_short_sat(v+(v>=0 ? 0.5 : -0.5));
}
unsigned short round_ushort(double v){
return convert_ushort_sat(v+(v>=0 ? 0.5 : -0.5));
}
int round_int(double v){
return convert_int_sat(v+(v>=0 ? 0.5 : -0.5));
}
char round2_char(double v){
char v1=(char)v;
if((v-v1)==0.5&&v1%2==0)
return v1;
else
return convert_char_sat(v+(v>=0 ? 0.5 : -0.5));
}
unsigned char round2_uchar(double v){
unsigned char v1=(unsigned char)v;
if((v-v1)==0.5&&v1%2==0)
return v1;
else
return convert_uchar_sat(v+(v>=0 ? 0.5 : -0.5));
}
short round2_short(double v){
short v1=(short)v;
if((v-v1)==0.5&&v1%2==0)
return v1;
else
return convert_short_sat(v+(v>=0 ? 0.5 : -0.5));
}
unsigned short round2_ushort(double v){
unsigned short v1=(unsigned short)v;
if((v-v1)==0.5&&v1%2==0)
return v1;
else
return convert_ushort_sat(v+(v>=0 ? 0.5 : -0.5));
}
int round2_int(double v){
int v1=(int)v;
if((v-v1)==0.5&&v1%2==0)
return v1;
else
return convert_int_sat(v+(v>=0 ? 0.5 : -0.5));
}
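//round_* implement round-half-away-from-zero, e.g. round_int(2.5) == 3 and
//round_int(-2.5) == -3; round2_* additionally keep non-negative .5 ties that
//truncate to an even integer, e.g. round2_int(2.5) == 2.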
/*****************************************EXP***************************************/
__kernel void arithm_op_exp_5 (int rows,int cols,int srcStep,__global float *src1Mat,
__global float * dstMat,int channels)
{
size_t x = get_global_id(0);
size_t y = get_global_id(1);
if (x < cols && y < rows)
{
size_t idx = y * ( srcStep >> 2 ) + x;
dstMat[idx] = (float)exp((float)src1Mat[idx]);
}
}
__kernel void arithm_op_exp_6 (int rows,int cols,int srcStep,__global double *src1Mat,
__global double * dstMat,int channels)
{
size_t x = get_global_id(0);
size_t y = get_global_id(1);
if (x < cols && y < rows)
{
size_t idx = y * ( srcStep >> 3 ) + x;
dstMat[idx] = exp(src1Mat[idx]);
}
}
/*****************************************LOG***************************************/
__kernel void arithm_op_log_5 (int rows,int cols,int srcStep,__global float *src1Mat,
__global float * dstMat,int channels)
{
size_t x = get_global_id(0);
size_t y = get_global_id(1);
if (x < cols && y < rows)
{
size_t idx = y * ( srcStep >> 2 ) + x;
dstMat[idx] =(float) log((float)src1Mat[idx]);
}
}
__kernel void arithm_op_log_6 (int rows,int cols,int srcStep,__global double *src1Mat,
__global double * dstMat,int channels)
{
size_t x = get_global_id(0);
size_t y = get_global_id(1);
if (x < cols && y < rows)
{
size_t idx = y * ( srcStep >> 3 ) + x;
dstMat[idx] = log(src1Mat[idx]);
}
}
View File

@ -0,0 +1,162 @@
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Niko Li, newlife20080214@gmail.com
// Rock Li, Rock.li@amd.com
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//
#if defined (DOUBLE_SUPPORT)
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#endif
__kernel
void LUT_C1_D0( __global uchar *dst,
__global const uchar *src,
__constant uchar *table,
int rows,
int cols,
int channels,
int whole_rows,
int whole_cols,
int src_offset,
int dst_offset,
int lut_offset,
int src_step,
int dst_step)
{
int gidx = get_global_id(0)<<2;
int gidy = get_global_id(1);
int lidx = get_local_id(0);
int lidy = get_local_id(1);
__local uchar l[256];
l[(lidy<<4)+lidx] = table[(lidy<<4)+lidx+lut_offset];
//mem_fence(CLK_LOCAL_MEM_FENCE);
//clamp(gidx,mask,cols-1);
gidx = gidx >= cols-4?cols-4:gidx;
gidy = gidy >= rows?rows-1:gidy;
int src_index = src_offset + mad24(gidy,src_step,gidx);
int dst_index = dst_offset + mad24(gidy,dst_step,gidx);
uchar4 p,q;
barrier(CLK_LOCAL_MEM_FENCE);
p.x = src[src_index];
p.y = src[src_index+1];
p.z = src[src_index+2];
p.w = src[src_index+3];
q.x = l[p.x];
q.y = l[p.y];
q.z = l[p.z];
q.w = l[p.w];
*(__global uchar4*)(dst + dst_index) = q;
}
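//The 256-entry table is staged into __local memory by the 16x16 work-group
//(one byte per work-item) ahead of the barrier; each work-item then maps four
//consecutive pixels through the local copy.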
__kernel
void LUT2_C1_D0( __global uchar *dst,
__global const uchar *src,
__constant uchar *table,
int rows,
int precols,
int channels,
int whole_rows,
int cols,
int src_offset,
int dst_offset,
int lut_offset,
int src_step,
int dst_step)
{
int gidx = get_global_id(0);
int gidy = get_global_id(1);
//int lidx = get_local_id(0);
int lidy = get_local_id(1);
__local uchar l[256];
l[lidy] = table[lidy+lut_offset];
//mem_fence(CLK_LOCAL_MEM_FENCE);
//clamp(gidx,mask,cols-1);
gidx = gidx >= precols ? cols+gidx : gidx;
gidy = gidy >= rows?rows-1:gidy;
int src_index = src_offset + mad24(gidy,src_step,gidx);
int dst_index = dst_offset + mad24(gidy,dst_step,gidx);
//uchar4 p,q;
barrier(CLK_LOCAL_MEM_FENCE);
uchar p = src[src_index];
uchar q = l[p];
dst[dst_index] = q;
}
__kernel
void LUT_C4_D0( __global uchar4 *dst,
__global uchar4 *src,
__constant uchar *table,
uint rows,
uint cols,
uint channels,
uint whole_rows,
uint whole_cols,
uint src_offset,
uint dst_offset,
uint lut_offset,
uint src_step,
uint dst_step)
{
uint gidx = get_global_id(0);
uint gidy = get_global_id(1);
uint lidx = get_local_id(0);
uint lidy = get_local_id(1);
__local uchar l[256];
l[lidy*16+lidx] = table[lidy*16+lidx+lut_offset];
mem_fence(CLK_LOCAL_MEM_FENCE);
barrier(CLK_LOCAL_MEM_FENCE);
gidx = gidx >= cols?cols-1:gidx;
gidy = gidy >= rows?rows-1:gidy;
uint src_index = src_offset/4 + gidy * src_step/4 + gidx;
uint dst_index = dst_offset/4 + gidy * dst_step/4 + gidx;
uchar4 p = src[src_index];
dst[dst_index].x = l[p.x];
dst[dst_index].y = l[p.y];
dst[dst_index].z = l[p.z];
dst[dst_index].w = l[p.w];
}
View File

@ -0,0 +1,917 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Jia Haipeng, jiahaipeng95@gmail.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if defined (DOUBLE_SUPPORT)
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#endif
//////////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////absdiff////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////////
/**************************************absdiff*************************************/
__kernel void arithm_absdiff_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
uchar4 src1_data = vload4(0, src1 + src1_index);
uchar4 src2_data = vload4(0, src2 + src2_index);
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = abs_diff(src1_data, src2_data);
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;
*((__global uchar4 *)(dst + dst_index)) = dst_data;
}
}
__kernel void arithm_absdiff_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *src2, int src2_step, int src2_offset,
__global ushort *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#undef dst_align // each element size redefines dst_align; #undef keeps the redefinition legal
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8);
ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index));
ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));
ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index));
ushort4 tmp_data = abs_diff(src1_data, src2_data);
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y;
dst_data.z = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.z : dst_data.z;
dst_data.w = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) ? tmp_data.w : dst_data.w;
*((__global ushort4 *)((__global char *)dst + dst_index)) = dst_data;
}
}
__kernel void arithm_absdiff_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *src2, int src2_step, int src2_offset,
__global short *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#undef dst_align
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8);
short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index));
short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));
short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
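// abs_diff on a signed vector returns the unsigned type (|x - y| without
// modulo overflow), so the result is saturated back to short below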
ushort4 tmp = abs_diff(src1_data, src2_data);
short4 tmp_data = convert_short4_sat(tmp);
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y;
dst_data.z = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.z : dst_data.z;
dst_data.w = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) ? tmp_data.w : dst_data.w;
*((__global short4 *)((__global char *)dst + dst_index)) = dst_data;
}
}
__kernel void arithm_absdiff_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *src2, int src2_step, int src2_offset,
__global int *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
int data1 = *((__global int *)((__global char *)src1 + src1_index));
int data2 = *((__global int *)((__global char *)src2 + src2_index));
uint tmp = abs_diff(data1, data2);
int tmp_data = convert_int_sat(tmp);
*((__global int *)((__global char *)dst + dst_index)) = tmp_data;
}
}
__kernel void arithm_absdiff_D5 (__global float *src1, int src1_step, int src1_offset,
__global float *src2, int src2_step, int src2_offset,
__global float *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
float data1 = *((__global float *)((__global char *)src1 + src1_index));
float data2 = *((__global float *)((__global char *)src2 + src2_index));
float tmp = fabs(data1 - data2);
*((__global float *)((__global char *)dst + dst_index)) = tmp;
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_absdiff_D6 (__global double *src1, int src1_step, int src1_offset,
__global double *src2, int src2_step, int src2_offset,
__global double *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset);
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
double data1 = *((__global double *)((__global char *)src1 + src1_index));
double data2 = *((__global double *)((__global char *)src2 + src2_index));
double tmp = fabs(data1-data2);
*((__global double *)((__global char *)dst + dst_index)) = tmp;
}
}
#endif
/**************************************absdiff with scalar**************************************/
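// In the *_s_* variants src2 is a scalar passed as an int4/float4/double4;
// only the first Cn components are meaningful for a Cn-channel image.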
__kernel void arithm_s_absdiff_C1_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#undef dst_align
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
uchar4 src1_data = vload4(0, src1 + src1_index);
int4 src2_data = (int4)(src2.x, src2.x, src2.x, src2.x);
uchar4 data = *((__global uchar4 *)(dst + dst_index));
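// widen to int before abs_diff: the scalar may be negative or exceed the
// uchar range, so the difference is formed in int and saturated back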
uchar4 tmp_data = convert_uchar4_sat(abs_diff(convert_int4_sat(src1_data), src2_data));
data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x;
data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : data.y;
data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : data.z;
data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : data.w;
*((__global uchar4 *)(dst + dst_index)) = data;
}
}
__kernel void arithm_s_absdiff_C1_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 1;
#undef dst_align
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
ushort2 src1_data = vload2(0, (__global ushort *)((__global char *)src1 + src1_index));
int2 src2_data = (int2)(src2.x, src2.x);
ushort2 data = *((__global ushort2 *)((__global uchar *)dst + dst_index));
ushort2 tmp_data = convert_ushort2_sat(abs_diff(convert_int2_sat(src1_data), src2_data));
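// each lane keeps only the edge check it can fail: after the align-down,
// lane 0 can only under-run dst_start and lane 1 can only run past dst_end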
data.x = (dst_index + 0 >= dst_start) ? tmp_data.x : data.x;
data.y = (dst_index + 2 < dst_end ) ? tmp_data.y : data.y;
*((__global ushort2 *)((__global uchar *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_absdiff_C1_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 1;
#undef dst_align
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
short2 src1_data = vload2(0, (__global short *)((__global char *)src1 + src1_index));
int2 src2_data = (int2)(src2.x, src2.x);
short2 data = *((__global short2 *)((__global uchar *)dst + dst_index));
ushort2 tmp = convert_ushort2_sat(abs_diff(convert_int2_sat(src1_data), src2_data));
short2 tmp_data = convert_short2_sat(tmp);
data.x = (dst_index + 0 >= dst_start) ? tmp_data.x : data.x;
data.y = (dst_index + 2 < dst_end ) ? tmp_data.y : data.y;
*((__global short2 *)((__global uchar *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_absdiff_C1_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
int src_data1 = *((__global int *)((__global char *)src1 + src1_index));
int src_data2 = src2.x;
int dst_data = *((__global int *)((__global char *)dst + dst_index));
uint tmp_data = abs_diff(src_data1, src_data2);
int data = convert_int_sat(tmp_data);
*((__global int *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_absdiff_C1_D5 (__global float *src1, int src1_step, int src1_offset,
__global float *dst, int dst_step, int dst_offset,
float4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
float src_data1 = *((__global float *)((__global char *)src1 + src1_index));
float src_data2 = src2.x;
float dst_data = *((__global float *)((__global char *)dst + dst_index));
float data = fabs(src_data1 - src_data2);
*((__global float *)((__global char *)dst + dst_index)) = data;
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_s_absdiff_C1_D6 (__global double *src1, int src1_step, int src1_offset,
__global double *dst, int dst_step, int dst_offset,
double4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
double src_data1 = *((__global double *)((__global char *)src1 + src1_index));
double src2_data = src2.x;
double dst_data = *((__global double *)((__global char *)dst + dst_index));
double data = fabs(src_data1 - src2_data);
*((__global double *)((__global char *)dst + dst_index)) = data;
}
}
#endif
__kernel void arithm_s_absdiff_C2_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 1;
#undef dst_align
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
uchar4 src1_data = vload4(0, src1 + src1_index);
int4 src2_data = (int4)(src2.x, src2.y, src2.x, src2.y);
uchar4 data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4_sat(abs_diff(convert_int4_sat(src1_data), src2_data));
data.xy = (dst_index + 0 >= dst_start) ? tmp_data.xy : data.xy;
data.zw = (dst_index + 2 < dst_end ) ? tmp_data.zw : data.zw;
*((__global uchar4 *)(dst + dst_index)) = data;
}
}
__kernel void arithm_s_absdiff_C2_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
ushort2 src_data1 = *((__global ushort2 *)((__global char *)src1 + src1_index));
int2 src_data2 = (int2)(src2.x, src2.y);
ushort2 dst_data = *((__global ushort2 *)((__global char *)dst + dst_index));
ushort2 data = convert_ushort2_sat( abs_diff(convert_int2_sat(src_data1), src_data2));
*((__global ushort2 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_absdiff_C2_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
short2 src_data1 = *((__global short2 *)((__global char *)src1 + src1_index));
int2 src_data2 = (int2)(src2.x, src2.y);
short2 dst_data = *((__global short2 *)((__global char *)dst + dst_index));
ushort2 tmp = convert_ushort2_sat(abs_diff(convert_int2_sat(src_data1), src_data2));
short2 data = convert_short2_sat(tmp);
*((__global short2 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_absdiff_C2_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
int2 src_data1 = *((__global int2 *)((__global char *)src1 + src1_index));
int2 src_data2 = (int2)(src2.x, src2.y);
int2 dst_data = *((__global int2 *)((__global char *)dst + dst_index));
int2 data = convert_int2_sat(abs_diff(src_data1, src_data2));
*((__global int2 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_absdiff_C2_D5 (__global float *src1, int src1_step, int src1_offset,
__global float *dst, int dst_step, int dst_offset,
float4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
float2 src_data1 = *((__global float2 *)((__global char *)src1 + src1_index));
float2 src_data2 = (float2)(src2.x, src2.y);
float2 dst_data = *((__global float2 *)((__global char *)dst + dst_index));
float2 data = fabs(src_data1 - src_data2);
*((__global float2 *)((__global char *)dst + dst_index)) = data;
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_s_absdiff_C2_D6 (__global double *src1, int src1_step, int src1_offset,
__global double *dst, int dst_step, int dst_offset,
double4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 4) + dst_offset);
double2 src_data1 = *((__global double2 *)((__global char *)src1 + src1_index));
double2 src_data2 = (double2)(src2.x, src2.y);
double2 dst_data = *((__global double2 *)((__global char *)dst + dst_index));
double2 data = fabs(src_data1 - src_data2);
*((__global double2 *)((__global char *)dst + dst_index)) = data;
}
}
#endif
__kernel void arithm_s_absdiff_C3_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#undef dst_align
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3));
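// 4 three-channel pixels = 12 bytes, processed as three uchar4 loads; the
// scalar is rotated across the vectors (xyzx / yzxy / zxyz) so that every
// byte lane meets its own channel value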
uchar4 src1_data_0 = vload4(0, src1 + src1_index + 0);
uchar4 src1_data_1 = vload4(0, src1 + src1_index + 4);
uchar4 src1_data_2 = vload4(0, src1 + src1_index + 8);
int4 src2_data_0 = (int4)(src2.x, src2.y, src2.z, src2.x);
int4 src2_data_1 = (int4)(src2.y, src2.z, src2.x, src2.y);
int4 src2_data_2 = (int4)(src2.z, src2.x, src2.y, src2.z);
uchar4 data_0 = *((__global uchar4 *)(dst + dst_index + 0));
uchar4 data_1 = *((__global uchar4 *)(dst + dst_index + 4));
uchar4 data_2 = *((__global uchar4 *)(dst + dst_index + 8));
uchar4 tmp_data_0 = convert_uchar4_sat(abs_diff(convert_int4_sat(src1_data_0), src2_data_0));
uchar4 tmp_data_1 = convert_uchar4_sat(abs_diff(convert_int4_sat(src1_data_1), src2_data_1));
uchar4 tmp_data_2 = convert_uchar4_sat(abs_diff(convert_int4_sat(src1_data_2), src2_data_2));
data_0.xyz = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
data_0.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_0.w : data_0.w;
data_1.xy = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_1.xy : data_1.xy;
data_1.zw = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.zw : data_1.zw;
data_2.x = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.x : data_2.x;
data_2.yzw = ((dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
? tmp_data_2.yzw : data_2.yzw;
*((__global uchar4 *)(dst + dst_index + 0)) = data_0;
*((__global uchar4 *)(dst + dst_index + 4)) = data_1;
*((__global uchar4 *)(dst + dst_index + 8)) = data_2;
}
}
__kernel void arithm_s_absdiff_C3_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 1;
#undef dst_align
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6));
ushort2 src1_data_0 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 0));
ushort2 src1_data_1 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 4));
ushort2 src1_data_2 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 8));
int2 src2_data_0 = (int2)(src2.x, src2.y);
int2 src2_data_1 = (int2)(src2.z, src2.x);
int2 src2_data_2 = (int2)(src2.y, src2.z);
ushort2 data_0 = *((__global ushort2 *)((__global char *)dst + dst_index + 0));
ushort2 data_1 = *((__global ushort2 *)((__global char *)dst + dst_index + 4));
ushort2 data_2 = *((__global ushort2 *)((__global char *)dst + dst_index + 8));
ushort2 tmp_data_0 = convert_ushort2_sat(abs_diff(convert_int2_sat(src1_data_0), src2_data_0));
ushort2 tmp_data_1 = convert_ushort2_sat(abs_diff(convert_int2_sat(src1_data_1), src2_data_1));
ushort2 tmp_data_2 = convert_ushort2_sat(abs_diff(convert_int2_sat(src1_data_2), src2_data_2));
data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;
data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;
*((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
*((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
*((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
}
}
__kernel void arithm_s_absdiff_C3_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 1;
#undef dst_align
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6));
short2 src1_data_0 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 0));
short2 src1_data_1 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 4));
short2 src1_data_2 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 8));
int2 src2_data_0 = (int2)(src2.x, src2.y);
int2 src2_data_1 = (int2)(src2.z, src2.x);
int2 src2_data_2 = (int2)(src2.y, src2.z);
short2 data_0 = *((__global short2 *)((__global char *)dst + dst_index + 0));
short2 data_1 = *((__global short2 *)((__global char *)dst + dst_index + 4));
short2 data_2 = *((__global short2 *)((__global char *)dst + dst_index + 8));
short2 tmp_data_0 = convert_short2_sat(abs_diff(convert_int2_sat(src1_data_0), src2_data_0));
short2 tmp_data_1 = convert_short2_sat(abs_diff(convert_int2_sat(src1_data_1), src2_data_1));
short2 tmp_data_2 = convert_short2_sat(abs_diff(convert_int2_sat(src1_data_2), src2_data_2));
data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;
data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;
*((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
*((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
*((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
}
}
__kernel void arithm_s_absdiff_C3_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
int src1_data_0 = *((__global int *)((__global char *)src1 + src1_index + 0));
int src1_data_1 = *((__global int *)((__global char *)src1 + src1_index + 4));
int src1_data_2 = *((__global int *)((__global char *)src1 + src1_index + 8));
int src2_data_0 = src2.x;
int src2_data_1 = src2.y;
int src2_data_2 = src2.z;
int data_0 = *((__global int *)((__global char *)dst + dst_index + 0));
int data_1 = *((__global int *)((__global char *)dst + dst_index + 4));
int data_2 = *((__global int *)((__global char *)dst + dst_index + 8));
int tmp_data_0 = convert_int_sat(abs_diff(src1_data_0, src2_data_0));
int tmp_data_1 = convert_int_sat(abs_diff(src1_data_1, src2_data_1));
int tmp_data_2 = convert_int_sat(abs_diff(src1_data_2, src2_data_2));
*((__global int *)((__global char *)dst + dst_index + 0))= tmp_data_0;
*((__global int *)((__global char *)dst + dst_index + 4))= tmp_data_1;
*((__global int *)((__global char *)dst + dst_index + 8))= tmp_data_2;
}
}
__kernel void arithm_s_absdiff_C3_D5 (__global float *src1, int src1_step, int src1_offset,
__global float *dst, int dst_step, int dst_offset,
float4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
float src1_data_0 = *((__global float *)((__global char *)src1 + src1_index + 0));
float src1_data_1 = *((__global float *)((__global char *)src1 + src1_index + 4));
float src1_data_2 = *((__global float *)((__global char *)src1 + src1_index + 8));
float src2_data_0 = src2.x;
float src2_data_1 = src2.y;
float src2_data_2 = src2.z;
float data_0 = *((__global float *)((__global char *)dst + dst_index + 0));
float data_1 = *((__global float *)((__global char *)dst + dst_index + 4));
float data_2 = *((__global float *)((__global char *)dst + dst_index + 8));
float tmp_data_0 = fabs(src1_data_0 - src2_data_0);
float tmp_data_1 = fabs(src1_data_1 - src2_data_1);
float tmp_data_2 = fabs(src1_data_2 - src2_data_2);
*((__global float *)((__global char *)dst + dst_index + 0))= tmp_data_0;
*((__global float *)((__global char *)dst + dst_index + 4))= tmp_data_1;
*((__global float *)((__global char *)dst + dst_index + 8))= tmp_data_2;
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_s_absdiff_C3_D6 (__global double *src1, int src1_step, int src1_offset,
__global double *dst, int dst_step, int dst_offset,
double4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 24));
double src1_data_0 = *((__global double *)((__global char *)src1 + src1_index + 0 ));
double src1_data_1 = *((__global double *)((__global char *)src1 + src1_index + 8 ));
double src1_data_2 = *((__global double *)((__global char *)src1 + src1_index + 16));
double src2_data_0 = src2.x;
double src2_data_1 = src2.y;
double src2_data_2 = src2.z;
double data_0 = *((__global double *)((__global char *)dst + dst_index + 0 ));
double data_1 = *((__global double *)((__global char *)dst + dst_index + 8 ));
double data_2 = *((__global double *)((__global char *)dst + dst_index + 16));
double tmp_data_0 = fabs(src1_data_0 - src2_data_0);
double tmp_data_1 = fabs(src1_data_1 - src2_data_1);
double tmp_data_2 = fabs(src1_data_2 - src2_data_2);
*((__global double *)((__global char *)dst + dst_index + 0 ))= tmp_data_0;
*((__global double *)((__global char *)dst + dst_index + 8 ))= tmp_data_1;
*((__global double *)((__global char *)dst + dst_index + 16))= tmp_data_2;
}
}
#endif
__kernel void arithm_s_absdiff_C4_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
uchar4 src_data1 = *((__global uchar4 *)(src1 + src1_index));
uchar4 data = convert_uchar4_sat(abs_diff(convert_int4_sat(src_data1), src2));
*((__global uchar4 *)(dst + dst_index)) = data;
}
}
__kernel void arithm_s_absdiff_C4_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
ushort4 src_data1 = *((__global ushort4 *)((__global char *)src1 + src1_index));
ushort4 data = convert_ushort4_sat(abs_diff(convert_int4_sat(src_data1), src2));
*((__global ushort4 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_absdiff_C4_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
short4 src_data1 = *((__global short4 *)((__global char *)src1 + src1_index));
short4 data = convert_short4_sat(abs_diff(convert_int4_sat(src_data1), src2));
*((__global short4 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_absdiff_C4_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 4) + dst_offset);
int4 src_data1 = *((__global int4 *)((__global char *)src1 + src1_index));
int4 data = convert_int4_sat(abs_diff(src_data1, src2));
*((__global int4 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_absdiff_C4_D5 (__global float *src1, int src1_step, int src1_offset,
__global float *dst, int dst_step, int dst_offset,
float4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 4) + dst_offset);
float4 src_data1 = *((__global float4 *)((__global char *)src1 + src1_index));
float4 data = fabs(src_data1 - src2);
*((__global float4 *)((__global char *)dst + dst_index)) = data;
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_s_absdiff_C4_D6 (__global double *src1, int src1_step, int src1_offset,
__global double *dst, int dst_step, int dst_offset,
double4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 5) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 5) + dst_offset);
double4 src_data1 = *((__global double4 *)((__global char *)src1 + src1_index));
double4 data = fabs(src_data1 - src2);
*((__global double4 *)((__global char *)dst + dst_index)) = data;
}
}
#endif

File diff suppressed because it is too large

View File

@ -0,0 +1,332 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Jia Haipeng, jiahaipeng95@gmail.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if defined DOUBLE_SUPPORT
#pragma OPENCL EXTENSION cl_khr_fp64:enable
typedef double F;
#else
typedef float F;
#endif
//////////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////addWeighted//////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////////
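// dst = saturate(src1*alpha + src2*beta + gama), evaluated per element;
// F (see the typedef above) is double on fp64-capable devices, float otherwise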
__kernel void addWeighted_D0 (__global uchar *src1, F alpha,int src1_step,int src1_offset,
__global uchar *src2, F beta, int src2_step,int src2_offset,
F gama,
__global uchar *dst, int dst_step,int dst_offset,
int rows, int cols,int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
uchar4 src1_data = vload4(0, src1 + src1_index);
uchar4 src2_data = vload4(0, src2 + src2_index);
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
// short4 tmp = convert_short4_sat(src1_data) * alpha + convert_short4_sat(src2_data) * beta + gama;
float4 tmp; // float intermediate: a short could overflow before the saturating convert below
tmp.x = src1_data.x * alpha + src2_data.x * beta + gama;
tmp.y = src1_data.y * alpha + src2_data.y * beta + gama;
tmp.z = src1_data.z * alpha + src2_data.z * beta + gama;
tmp.w = src1_data.w * alpha + src2_data.w * beta + gama;
uchar4 tmp_data = convert_uchar4_sat(tmp);
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;
*((__global uchar4 *)(dst + dst_index)) = dst_data;
// dst[x + y * dst_step] = src1[x + y * src1_step] * alpha + src2[x + y * src2_step] * beta + gama;
}
}
__kernel void addWeighted_D2 (__global ushort *src1, F alpha,int src1_step,int src1_offset,
__global ushort *src2, F beta, int src2_step,int src2_offset,
F gama,
__global ushort *dst, int dst_step,int dst_offset,
int rows, int cols,int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#undef dst_align // each element size redefines dst_align; #undef keeps the redefinition legal
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8);
ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index));
ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));
ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index));
// int4 tmp = convert_int4_sat(src1_data) * alpha + convert_int4_sat(src2_data) * beta + gama;
int4 tmp;
tmp.x = src1_data.x * alpha + src2_data.x * beta + gama;
tmp.y = src1_data.y * alpha + src2_data.y * beta + gama;
tmp.z = src1_data.z * alpha + src2_data.z * beta + gama;
tmp.w = src1_data.w * alpha + src2_data.w * beta + gama;
ushort4 tmp_data = convert_ushort4_sat(tmp);
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y;
dst_data.z = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.z : dst_data.z;
dst_data.w = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) ? tmp_data.w : dst_data.w;
*((__global ushort4 *)((__global char *)dst + dst_index)) = dst_data;
}
}
__kernel void addWeighted_D3 (__global short *src1, F alpha,int src1_step,int src1_offset,
__global short *src2, F beta, int src2_step,int src2_offset,
F gama,
__global short *dst, int dst_step,int dst_offset,
int rows, int cols,int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#undef dst_align
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) - (dst_align << 1));
short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index));
short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));
short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
// int4 tmp = convert_int4_sat(src1_data) * alpha + convert_int4_sat(src2_data) * beta + gama;
int4 tmp;
tmp.x = src1_data.x * alpha + src2_data.x * beta + gama;
tmp.y = src1_data.y * alpha + src2_data.y * beta + gama;
tmp.z = src1_data.z * alpha + src2_data.z * beta + gama;
tmp.w = src1_data.w * alpha + src2_data.w * beta + gama;
short4 tmp_data = convert_short4_sat(tmp);
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y;
dst_data.z = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.z : dst_data.z;
dst_data.w = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) ? tmp_data.w : dst_data.w;
*((__global short4 *)((__global char *)dst + dst_index)) = dst_data;
}
}
__kernel void addWeighted_D4 (__global int *src1, F alpha,int src1_step,int src1_offset,
__global int *src2, F beta, int src2_step,int src2_offset,
F gama,
__global int *dst, int dst_step,int dst_offset,
int rows, int cols,int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
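// OpenCL C fixes sizeof(int) at 4, so bitOfInt always evaluates to 2;
// the ternary below is redundant but harmless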
#define bitOfInt (sizeof(int)== 4 ? 2: 3)
#undef dst_align
#define dst_align ((dst_offset >> bitOfInt) & 3)
int src1_index = mad24(y, src1_step, (x << bitOfInt) + src1_offset - (dst_align << bitOfInt));
int src2_index = mad24(y, src2_step, (x << bitOfInt) + src2_offset - (dst_align << bitOfInt));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << bitOfInt) -(dst_align << bitOfInt));
int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index));
int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index));
int4 dst_data = *((__global int4 *)((__global char *)dst + dst_index));
// double4 tmp = convert_double4(src1_data) * alpha + convert_double4(src2_data) * beta + gama ;
float4 tmp;
tmp.x = src1_data.x * alpha + src2_data.x * beta + gama;
tmp.y = src1_data.y * alpha + src2_data.y * beta + gama;
tmp.z = src1_data.z * alpha + src2_data.z * beta + gama;
tmp.w = src1_data.w * alpha + src2_data.w * beta + gama;
int4 tmp_data = convert_int4_sat(tmp);
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.y : dst_data.y;
dst_data.z = ((dst_index + 8 >= dst_start) && (dst_index + 8 < dst_end)) ? tmp_data.z : dst_data.z;
dst_data.w = ((dst_index + 12 >= dst_start) && (dst_index + 12 < dst_end)) ? tmp_data.w : dst_data.w;
*((__global int4 *)((__global char *)dst + dst_index)) = dst_data;
}
}
__kernel void addWeighted_D5 (__global float *src1, F alpha,int src1_step,int src1_offset,
__global float *src2, F beta, int src2_step,int src2_offset,
F gama,
__global float *dst, int dst_step,int dst_offset,
int rows, int cols,int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#undef dst_align
#define dst_align ((dst_offset >> 2) & 3)
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 2) -(dst_align << 2));
float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index));
float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index));
float4 dst_data = *((__global float4 *)((__global char *)dst + dst_index));
// double4 tmp = convert_double4(src1_data) * alpha + convert_double4(src2_data) * beta + gama ;
// float4 tmp_data =(src1_data) * alpha + (src2_data) * beta + gama ;
float4 tmp_data;
tmp_data.x = src1_data.x * alpha + src2_data.x * beta + gama;
tmp_data.y = src1_data.y * alpha + src2_data.y * beta + gama;
tmp_data.z = src1_data.z * alpha + src2_data.z * beta + gama;
tmp_data.w = src1_data.w * alpha + src2_data.w * beta + gama;
// float4 tmp_data = convert_float4(tmp);
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.y : dst_data.y;
dst_data.z = ((dst_index + 8 >= dst_start) && (dst_index + 8 < dst_end)) ? tmp_data.z : dst_data.z;
dst_data.w = ((dst_index + 12 >= dst_start) && (dst_index + 12 < dst_end)) ? tmp_data.w : dst_data.w;
*((__global float4 *)((__global char *)dst + dst_index)) = dst_data;
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void addWeighted_D6 (__global double *src1, F alpha,int src1_step,int src1_offset,
__global double *src2, F beta, int src2_step,int src2_offset,
F gama,
__global double *dst, int dst_step,int dst_offset,
int rows, int cols,int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#undef dst_align
#define dst_align ((dst_offset >> 3) & 3)
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 3) -(dst_align << 3));
double4 src1_data = vload4(0, (__global double *)((__global char *)src1 + src1_index));
double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index));
double4 dst_data = *((__global double4 *)((__global char *)dst + dst_index));
// double4 tmp_data = (src1_data) * alpha + (src2_data) * beta + gama ;
double4 tmp_data;
tmp_data.x = src1_data.x * alpha + src2_data.x * beta + gama;
tmp_data.y = src1_data.y * alpha + src2_data.y * beta + gama;
tmp_data.z = src1_data.z * alpha + src2_data.z * beta + gama;
tmp_data.w = src1_data.w * alpha + src2_data.w * beta + gama;
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_index + 8 >= dst_start) && (dst_index + 8 < dst_end)) ? tmp_data.y : dst_data.y;
dst_data.z = ((dst_index + 16 >= dst_start) && (dst_index + 16 < dst_end)) ? tmp_data.z : dst_data.z;
dst_data.w = ((dst_index + 24 >= dst_start) && (dst_index + 24 < dst_end)) ? tmp_data.w : dst_data.w;
*((__global double4 *)((__global char *)dst + dst_index)) = dst_data;
}
}
#endif

View File

@ -0,0 +1,744 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Jia Haipeng, jiahaipeng95@gmail.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if defined (DOUBLE_SUPPORT)
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#endif
/**************************************add with scalar without mask**************************************/
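// Same layout and masking conventions as the absdiff kernels above; the
// additions saturate, and the plain-int (D4) variants widen to long first
// so the sum cannot overflow before the saturating convert.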
__kernel void arithm_s_add_C1_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
uchar4 src1_data = vload4(0, src1 + src1_index);
int4 src2_data = (int4)(src2.x, src2.x, src2.x, src2.x);
uchar4 data = *((__global uchar4 *)(dst + dst_index));
int4 tmp = convert_int4_sat(src1_data) + src2_data;
uchar4 tmp_data = convert_uchar4_sat(tmp);
data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x;
data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : data.y;
data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : data.z;
data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : data.w;
*((__global uchar4 *)(dst + dst_index)) = data;
}
}
__kernel void arithm_s_add_C1_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 1;
#undef dst_align // each element size redefines dst_align; #undef keeps the redefinition legal
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
ushort2 src1_data = vload2(0, (__global ushort *)((__global char *)src1 + src1_index));
int2 src2_data = (int2)(src2.x, src2.x);
ushort2 data = *((__global ushort2 *)((__global uchar *)dst + dst_index));
int2 tmp = convert_int2_sat(src1_data) + src2_data;
ushort2 tmp_data = convert_ushort2_sat(tmp);
data.x = (dst_index + 0 >= dst_start) ? tmp_data.x : data.x;
data.y = (dst_index + 2 < dst_end ) ? tmp_data.y : data.y;
*((__global ushort2 *)((__global uchar *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_add_C1_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 1;
#undef dst_align
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
short2 src1_data = vload2(0, (__global short *)((__global char *)src1 + src1_index));
int2 src2_data = (int2)(src2.x, src2.x);
short2 data = *((__global short2 *)((__global uchar *)dst + dst_index));
int2 tmp = convert_int2_sat(src1_data) + src2_data;
short2 tmp_data = convert_short2_sat(tmp);
data.x = (dst_index + 0 >= dst_start) ? tmp_data.x : data.x;
data.y = (dst_index + 2 < dst_end ) ? tmp_data.y : data.y;
*((__global short2 *)((__global uchar *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_add_C1_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
int src_data1 = *((__global int *)((__global char *)src1 + src1_index));
int src_data2 = src2.x;
int dst_data = *((__global int *)((__global char *)dst + dst_index));
int data = convert_int_sat((long)src_data1 + (long)src_data2);
*((__global int *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_add_C1_D5 (__global float *src1, int src1_step, int src1_offset,
__global float *dst, int dst_step, int dst_offset,
float4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
float src_data1 = *((__global float *)((__global char *)src1 + src1_index));
float src_data2 = src2.x;
float dst_data = *((__global float *)((__global char *)dst + dst_index));
float data = src_data1 + src_data2;
*((__global float *)((__global char *)dst + dst_index)) = data;
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_s_add_C1_D6 (__global double *src1, int src1_step, int src1_offset,
__global double *dst, int dst_step, int dst_offset,
double4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
double src_data1 = *((__global double *)((__global char *)src1 + src1_index));
double src2_data = src2.x;
double dst_data = *((__global double *)((__global char *)dst + dst_index));
double data = src_data1 + src2_data;
*((__global double *)((__global char *)dst + dst_index)) = data;
}
}
#endif
__kernel void arithm_s_add_C2_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 1;
#undef dst_align
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
uchar4 src1_data = vload4(0, src1 + src1_index);
int4 src2_data = (int4)(src2.x, src2.y, src2.x, src2.y);
uchar4 data = *((__global uchar4 *)(dst + dst_index));
int4 tmp = convert_int4_sat(src1_data) + src2_data;
uchar4 tmp_data = convert_uchar4_sat(tmp);
data.xy = (dst_index + 0 >= dst_start) ? tmp_data.xy : data.xy;
data.zw = (dst_index + 2 < dst_end ) ? tmp_data.zw : data.zw;
*((__global uchar4 *)(dst + dst_index)) = data;
}
}
__kernel void arithm_s_add_C2_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
ushort2 src_data1 = *((__global ushort2 *)((__global char *)src1 + src1_index));
int2 src_data2 = (int2)(src2.x, src2.y);
ushort2 dst_data = *((__global ushort2 *)((__global char *)dst + dst_index));
int2 tmp = convert_int2_sat(src_data1) + src_data2;
ushort2 data = convert_ushort2_sat(tmp);
*((__global ushort2 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_add_C2_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
short2 src_data1 = *((__global short2 *)((__global char *)src1 + src1_index));
int2 src_data2 = (int2)(src2.x, src2.y);
short2 dst_data = *((__global short2 *)((__global char *)dst + dst_index));
int2 tmp = convert_int2_sat(src_data1) + src_data2;
short2 data = convert_short2_sat(tmp);
*((__global short2 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_add_C2_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
int2 src_data1 = *((__global int2 *)((__global char *)src1 + src1_index));
int2 src_data2 = (int2)(src2.x, src2.y);
int2 dst_data = *((__global int2 *)((__global char *)dst + dst_index));
int2 data = convert_int2_sat(convert_long2_sat(src_data1) + convert_long2_sat(src_data2));
*((__global int2 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_add_C2_D5 (__global float *src1, int src1_step, int src1_offset,
__global float *dst, int dst_step, int dst_offset,
float4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
float2 src_data1 = *((__global float2 *)((__global char *)src1 + src1_index));
float2 src_data2 = (float2)(src2.x, src2.y);
float2 dst_data = *((__global float2 *)((__global char *)dst + dst_index));
float2 data = src_data1 + src_data2;
*((__global float2 *)((__global char *)dst + dst_index)) = data;
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_s_add_C2_D6 (__global double *src1, int src1_step, int src1_offset,
__global double *dst, int dst_step, int dst_offset,
double4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 4) + dst_offset);
double2 src_data1 = *((__global double2 *)((__global char *)src1 + src1_index));
double2 src_data2 = (double2)(src2.x, src2.y);
double2 dst_data = *((__global double2 *)((__global char *)dst + dst_index));
double2 data = src_data1 + src_data2;
*((__global double2 *)((__global char *)dst + dst_index)) = data;
}
}
#endif
__kernel void arithm_s_add_C3_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#undef dst_align
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3));
uchar4 src1_data_0 = vload4(0, src1 + src1_index + 0);
uchar4 src1_data_1 = vload4(0, src1 + src1_index + 4);
uchar4 src1_data_2 = vload4(0, src1 + src1_index + 8);
int4 src2_data_0 = (int4)(src2.x, src2.y, src2.z, src2.x);
int4 src2_data_1 = (int4)(src2.y, src2.z, src2.x, src2.y);
int4 src2_data_2 = (int4)(src2.z, src2.x, src2.y, src2.z);
uchar4 data_0 = *((__global uchar4 *)(dst + dst_index + 0));
uchar4 data_1 = *((__global uchar4 *)(dst + dst_index + 4));
uchar4 data_2 = *((__global uchar4 *)(dst + dst_index + 8));
uchar4 tmp_data_0 = convert_uchar4_sat(convert_int4_sat(src1_data_0) + src2_data_0);
uchar4 tmp_data_1 = convert_uchar4_sat(convert_int4_sat(src1_data_1) + src2_data_1);
uchar4 tmp_data_2 = convert_uchar4_sat(convert_int4_sat(src1_data_2) + src2_data_2);
data_0.xyz = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
data_0.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_0.w : data_0.w;
data_1.xy = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_1.xy : data_1.xy;
data_1.zw = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.zw : data_1.zw;
data_2.x = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.x : data_2.x;
data_2.yzw = ((dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
? tmp_data_2.yzw : data_2.yzw;
*((__global uchar4 *)(dst + dst_index + 0)) = data_0;
*((__global uchar4 *)(dst + dst_index + 4)) = data_1;
*((__global uchar4 *)(dst + dst_index + 8)) = data_2;
}
}
__kernel void arithm_s_add_C3_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 1;
#undef dst_align
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6));
ushort2 src1_data_0 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 0));
ushort2 src1_data_1 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 4));
ushort2 src1_data_2 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 8));
int2 src2_data_0 = (int2)(src2.x, src2.y);
int2 src2_data_1 = (int2)(src2.z, src2.x);
int2 src2_data_2 = (int2)(src2.y, src2.z);
ushort2 data_0 = *((__global ushort2 *)((__global char *)dst + dst_index + 0));
ushort2 data_1 = *((__global ushort2 *)((__global char *)dst + dst_index + 4));
ushort2 data_2 = *((__global ushort2 *)((__global char *)dst + dst_index + 8));
ushort2 tmp_data_0 = convert_ushort2_sat(convert_int2_sat(src1_data_0) + src2_data_0);
ushort2 tmp_data_1 = convert_ushort2_sat(convert_int2_sat(src1_data_1) + src2_data_1);
ushort2 tmp_data_2 = convert_ushort2_sat(convert_int2_sat(src1_data_2) + src2_data_2);
data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;
data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;
*((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
*((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
*((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
}
}
__kernel void arithm_s_add_C3_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 1;
#undef dst_align
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6));
short2 src1_data_0 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 0));
short2 src1_data_1 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 4));
short2 src1_data_2 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 8));
int2 src2_data_0 = (int2)(src2.x, src2.y);
int2 src2_data_1 = (int2)(src2.z, src2.x);
int2 src2_data_2 = (int2)(src2.y, src2.z);
short2 data_0 = *((__global short2 *)((__global char *)dst + dst_index + 0));
short2 data_1 = *((__global short2 *)((__global char *)dst + dst_index + 4));
short2 data_2 = *((__global short2 *)((__global char *)dst + dst_index + 8));
short2 tmp_data_0 = convert_short2_sat(convert_int2_sat(src1_data_0) + src2_data_0);
short2 tmp_data_1 = convert_short2_sat(convert_int2_sat(src1_data_1) + src2_data_1);
short2 tmp_data_2 = convert_short2_sat(convert_int2_sat(src1_data_2) + src2_data_2);
data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;
data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;
*((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
*((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
*((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
}
}
__kernel void arithm_s_add_C3_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
int src1_data_0 = *((__global int *)((__global char *)src1 + src1_index + 0));
int src1_data_1 = *((__global int *)((__global char *)src1 + src1_index + 4));
int src1_data_2 = *((__global int *)((__global char *)src1 + src1_index + 8));
int src2_data_0 = src2.x;
int src2_data_1 = src2.y;
int src2_data_2 = src2.z;
int data_0 = *((__global int *)((__global char *)dst + dst_index + 0));
int data_1 = *((__global int *)((__global char *)dst + dst_index + 4));
int data_2 = *((__global int *)((__global char *)dst + dst_index + 8));
int tmp_data_0 = convert_int_sat((long)src1_data_0 + (long)src2_data_0);
int tmp_data_1 = convert_int_sat((long)src1_data_1 + (long)src2_data_1);
int tmp_data_2 = convert_int_sat((long)src1_data_2 + (long)src2_data_2);
*((__global int *)((__global char *)dst + dst_index + 0))= tmp_data_0;
*((__global int *)((__global char *)dst + dst_index + 4))= tmp_data_1;
*((__global int *)((__global char *)dst + dst_index + 8))= tmp_data_2;
}
}
__kernel void arithm_s_add_C3_D5 (__global float *src1, int src1_step, int src1_offset,
__global float *dst, int dst_step, int dst_offset,
float4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
float src1_data_0 = *((__global float *)((__global char *)src1 + src1_index + 0));
float src1_data_1 = *((__global float *)((__global char *)src1 + src1_index + 4));
float src1_data_2 = *((__global float *)((__global char *)src1 + src1_index + 8));
float src2_data_0 = src2.x;
float src2_data_1 = src2.y;
float src2_data_2 = src2.z;
float data_0 = *((__global float *)((__global char *)dst + dst_index + 0));
float data_1 = *((__global float *)((__global char *)dst + dst_index + 4));
float data_2 = *((__global float *)((__global char *)dst + dst_index + 8));
float tmp_data_0 = src1_data_0 + src2_data_0;
float tmp_data_1 = src1_data_1 + src2_data_1;
float tmp_data_2 = src1_data_2 + src2_data_2;
*((__global float *)((__global char *)dst + dst_index + 0))= tmp_data_0;
*((__global float *)((__global char *)dst + dst_index + 4))= tmp_data_1;
*((__global float *)((__global char *)dst + dst_index + 8))= tmp_data_2;
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_s_add_C3_D6 (__global double *src1, int src1_step, int src1_offset,
__global double *dst, int dst_step, int dst_offset,
double4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 24));
double src1_data_0 = *((__global double *)((__global char *)src1 + src1_index + 0 ));
double src1_data_1 = *((__global double *)((__global char *)src1 + src1_index + 8 ));
double src1_data_2 = *((__global double *)((__global char *)src1 + src1_index + 16));
double src2_data_0 = src2.x;
double src2_data_1 = src2.y;
double src2_data_2 = src2.z;
double data_0 = *((__global double *)((__global char *)dst + dst_index + 0 ));
double data_1 = *((__global double *)((__global char *)dst + dst_index + 8 ));
double data_2 = *((__global double *)((__global char *)dst + dst_index + 16));
double tmp_data_0 = src1_data_0 + src2_data_0;
double tmp_data_1 = src1_data_1 + src2_data_1;
double tmp_data_2 = src1_data_2 + src2_data_2;
*((__global double *)((__global char *)dst + dst_index + 0 ))= tmp_data_0;
*((__global double *)((__global char *)dst + dst_index + 8 ))= tmp_data_1;
*((__global double *)((__global char *)dst + dst_index + 16))= tmp_data_2;
}
}
#endif
__kernel void arithm_s_add_C4_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
uchar4 src_data1 = *((__global uchar4 *)(src1 + src1_index));
uchar4 data = convert_uchar4_sat(convert_int4_sat(src_data1) + src2);
*((__global uchar4 *)(dst + dst_index)) = data;
}
}
__kernel void arithm_s_add_C4_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
ushort4 src_data1 = *((__global ushort4 *)((__global char *)src1 + src1_index));
ushort4 data = convert_ushort4_sat(convert_int4_sat(src_data1) + src2);
*((__global ushort4 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_add_C4_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
short4 src_data1 = *((__global short4 *)((__global char *)src1 + src1_index));
short4 data = convert_short4_sat(convert_int4_sat(src_data1) + src2);
*((__global short4 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_add_C4_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 4) + dst_offset);
int4 src_data1 = *((__global int4 *)((__global char *)src1 + src1_index));
int4 data = convert_int4_sat(convert_long4_sat(src_data1) + convert_long4_sat(src2));
*((__global int4 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_add_C4_D5 (__global float *src1, int src1_step, int src1_offset,
__global float *dst, int dst_step, int dst_offset,
float4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 4) + dst_offset);
float4 src_data1 = *((__global float4 *)((__global char *)src1 + src1_index));
float4 data = src_data1 + src2;
*((__global float4 *)((__global char *)dst + dst_index)) = data;
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_s_add_C4_D6 (__global double *src1, int src1_step, int src1_offset,
__global double *dst, int dst_step, int dst_offset,
double4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 5) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 5) + dst_offset);
double4 src_data1 = *((__global double4 *)((__global char *)src1 + src1_index));
double4 data = src_data1 + src2;
*((__global double4 *)((__global char *)dst + dst_index)) = data;
}
}
#endif
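For reference, here is a minimal host-side sketch (not part of this commit) of how one of the kernels above, arithm_s_add_C4_D5, could be launched through the raw OpenCL C API; inside the module this is handled by the host-side wrappers. The source file name, image size, and scalar value are illustrative assumptions, and error checking is omitted for brevity.

#include <CL/cl.h>
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
    const int rows = 480, cols = 640;                  /* a CV_32FC4 image: 16 bytes per pixel */
    const size_t bytes = (size_t)rows * cols * 4 * sizeof(float);
    float *host = (float *)calloc((size_t)rows * cols * 4, sizeof(float));
    cl_float4 scalar = {{1.0f, 2.0f, 3.0f, 4.0f}};     /* added to every pixel */

    cl_platform_id plat;  cl_device_id dev;
    clGetPlatformIDs(1, &plat, NULL);
    clGetDeviceIDs(plat, CL_DEVICE_TYPE_DEFAULT, 1, &dev, NULL);
    cl_context ctx = clCreateContext(NULL, 1, &dev, NULL, NULL, NULL);
    cl_command_queue q = clCreateCommandQueue(ctx, dev, 0, NULL);

    /* load the kernel source shown above (the file name is an assumption) */
    FILE *f = fopen("arithm_add_scalar.cl", "rb");
    fseek(f, 0, SEEK_END);  long n = ftell(f);  fseek(f, 0, SEEK_SET);
    char *src_text = (char *)malloc((size_t)n + 1);
    fread(src_text, 1, (size_t)n, f);  src_text[n] = '\0';  fclose(f);

    cl_program prog = clCreateProgramWithSource(ctx, 1, (const char **)&src_text, NULL, NULL);
    clBuildProgram(prog, 1, &dev, "", NULL, NULL);     /* add "-D DOUBLE_SUPPORT" for the D6 kernels */
    cl_kernel k = clCreateKernel(prog, "arithm_s_add_C4_D5", NULL);

    cl_mem src = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, bytes, host, NULL);
    cl_mem dst = clCreateBuffer(ctx, CL_MEM_READ_WRITE, bytes, NULL, NULL);

    int step = cols * 4 * (int)sizeof(float);          /* row stride in bytes */
    int offset = 0;                                    /* no ROI: data starts at byte 0 */
    int dst_step1 = step;                              /* the whole row is writable */
    int a = 0;
    clSetKernelArg(k, a++, sizeof(cl_mem), &src);
    clSetKernelArg(k, a++, sizeof(int), &step);
    clSetKernelArg(k, a++, sizeof(int), &offset);
    clSetKernelArg(k, a++, sizeof(cl_mem), &dst);
    clSetKernelArg(k, a++, sizeof(int), &step);
    clSetKernelArg(k, a++, sizeof(int), &offset);
    clSetKernelArg(k, a++, sizeof(cl_float4), &scalar);
    clSetKernelArg(k, a++, sizeof(int), &rows);
    clSetKernelArg(k, a++, sizeof(int), &cols);
    clSetKernelArg(k, a++, sizeof(int), &dst_step1);

    size_t global[2] = { (size_t)cols, (size_t)rows }; /* C4_D5: one pixel per work-item */
    clEnqueueNDRangeKernel(q, k, 2, NULL, global, NULL, 0, NULL, NULL);
    clEnqueueReadBuffer(q, dst, CL_TRUE, 0, bytes, host, 0, NULL, NULL);

    clReleaseMemObject(src);  clReleaseMemObject(dst);
    clReleaseKernel(k);  clReleaseProgram(prog);
    clReleaseCommandQueue(q);  clReleaseContext(ctx);
    free(src_text);  free(host);
    return 0;
}

The vectorized one- and three-channel kernels (the ones that shift x after the bounds check) expect the x work-size to be divided by the per-item element count; C4_D5 needs no such adjustment because one work-item maps to exactly one pixel.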

View File

@ -0,0 +1,874 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Jia Haipeng, jiahaipeng95@gmail.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if defined (DOUBLE_SUPPORT)
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#endif
/**************************************add with scalar with mask**************************************/
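/*
   Naming convention: arithm_s_add_with_mask_C<n>_D<d>, where C<n> is the
   channel count (1..4) and D<d> is the OpenCV depth code of the element type:
   D0 = uchar (8U), D1 = char (8S), D2 = ushort (16U), D3 = short (16S),
   D4 = int (32S), D5 = float (32F), D6 = double (64F, compiled only when
   DOUBLE_SUPPORT is defined). All *_step arguments are row strides in bytes,
   all *_offset arguments are byte offsets to the ROI origin, and dst_step1 is
   the width of the valid destination row in bytes; together with
   dst_start/dst_end it clips the realigned, vectorized stores to the ROI.
   src2 carries the scalar, one lane per channel. The mask holds one byte per
   pixel (mask_index is never scaled by the channel count); a nonzero byte
   selects the computed sum, zero keeps the existing dst value.
*/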
__kernel void arithm_s_add_with_mask_C1_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#undef dst_align
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
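/* note: '+' binds tighter than '&', so the next line rounds (dst_offset + x)
   down to a 4-byte boundary; the dst_align correction applied to src1_index
   above keeps the source load in step with the realigned destination store */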
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
uchar4 src1_data = vload4(0, src1 + src1_index);
int4 src2_data = (int4)(src2.x, src2.x, src2.x, src2.x);
uchar4 mask_data = vload4(0, mask + mask_index);
uchar4 data = *((__global uchar4 *)(dst + dst_index));
int4 tmp = convert_int4_sat(src1_data) + src2_data;
uchar4 tmp_data = convert_uchar4_sat(tmp);
data.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x;
data.y = ((mask_data.y) && (dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : data.y;
data.z = ((mask_data.z) && (dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : data.z;
data.w = ((mask_data.w) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : data.w;
*((__global uchar4 *)(dst + dst_index)) = data;
}
}
__kernel void arithm_s_add_with_mask_C1_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 1;
#undef dst_align
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
ushort2 src1_data = vload2(0, (__global ushort *)((__global char *)src1 + src1_index));
int2 src2_data = (int2)(src2.x, src2.x);
uchar2 mask_data = vload2(0, mask + mask_index);
ushort2 data = *((__global ushort2 *)((__global uchar *)dst + dst_index));
int2 tmp = convert_int2_sat(src1_data) + src2_data;
ushort2 tmp_data = convert_ushort2_sat(tmp);
data.x = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.x : data.x;
data.y = ((mask_data.y) && (dst_index + 2 < dst_end )) ? tmp_data.y : data.y;
*((__global ushort2 *)((__global uchar *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_add_with_mask_C1_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 1;
#undef dst_align
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
short2 src1_data = vload2(0, (__global short *)((__global char *)src1 + src1_index));
int2 src2_data = (int2)(src2.x, src2.x);
uchar2 mask_data = vload2(0, mask + mask_index);
short2 data = *((__global short2 *)((__global uchar *)dst + dst_index));
int2 tmp = convert_int2_sat(src1_data) + src2_data;
short2 tmp_data = convert_short2_sat(tmp);
data.x = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.x : data.x;
data.y = ((mask_data.y) && (dst_index + 2 < dst_end )) ? tmp_data.y : data.y;
*((__global short2 *)((__global uchar *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_add_with_mask_C1_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
uchar mask_data = *(mask + mask_index);
int src_data1 = *((__global int *)((__global char *)src1 + src1_index));
int src_data2 = src2.x;
int dst_data = *((__global int *)((__global char *)dst + dst_index));
int data = convert_int_sat((long)src_data1 + (long)src_data2);
data = mask_data ? data : dst_data;
*((__global int *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_add_with_mask_C1_D5 (__global float *src1, int src1_step, int src1_offset,
__global float *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
float4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
uchar mask_data = *(mask + mask_index);
float src_data1 = *((__global float *)((__global char *)src1 + src1_index));
float src_data2 = src2.x;
float dst_data = *((__global float *)((__global char *)dst + dst_index));
float data = src_data1 + src_data2;
data = mask_data ? data : dst_data;
*((__global float *)((__global char *)dst + dst_index)) = data;
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_s_add_with_mask_C1_D6 (__global double *src1, int src1_step, int src1_offset,
__global double *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
double4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
uchar mask_data = *(mask + mask_index);
double src_data1 = *((__global double *)((__global char *)src1 + src1_index));
double src_data2 = src2.x;
double dst_data = *((__global double *)((__global char *)dst + dst_index));
double data = src_data1 + src_data2;
data = mask_data ? data : dst_data;
*((__global double *)((__global char *)dst + dst_index)) = data;
}
}
#endif
__kernel void arithm_s_add_with_mask_C2_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 1;
#undef dst_align
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
uchar4 src1_data = vload4(0, src1 + src1_index);
int4 src2_data = (int4)(src2.x, src2.y, src2.x, src2.y);
uchar2 mask_data = vload2(0, mask + mask_index);
uchar4 data = *((__global uchar4 *)(dst + dst_index));
int4 tmp = convert_int4_sat(src1_data) + src2_data;
uchar4 tmp_data = convert_uchar4_sat(tmp);
data.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.xy : data.xy;
data.zw = ((mask_data.y) && (dst_index + 2 < dst_end )) ? tmp_data.zw : data.zw;
*((__global uchar4 *)(dst + dst_index)) = data;
}
}
__kernel void arithm_s_add_with_mask_C2_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
uchar mask_data = *(mask + mask_index);
ushort2 src_data1 = *((__global ushort2 *)((__global char *)src1 + src1_index));
int2 src_data2 = (int2)(src2.x, src2.y);
ushort2 dst_data = *((__global ushort2 *)((__global char *)dst + dst_index));
int2 tmp = convert_int2_sat(src_data1) + src_data2;
ushort2 data = convert_ushort2_sat(tmp);
data = mask_data ? data : dst_data;
*((__global ushort2 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_add_with_mask_C2_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
uchar mask_data = *(mask + mask_index);
short2 src_data1 = *((__global short2 *)((__global char *)src1 + src1_index));
int2 src_data2 = (int2)(src2.x, src2.y);
short2 dst_data = *((__global short2 *)((__global char *)dst + dst_index));
int2 tmp = convert_int2_sat(src_data1) + src_data2;
short2 data = convert_short2_sat(tmp);
data = mask_data ? data : dst_data;
*((__global short2 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_add_with_mask_C2_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
uchar mask_data = *(mask + mask_index);
int2 src_data1 = *((__global int2 *)((__global char *)src1 + src1_index));
int2 src_data2 = (int2)(src2.x, src2.y);
int2 dst_data = *((__global int2 *)((__global char *)dst + dst_index));
int2 data = convert_int2_sat(convert_long2_sat(src_data1) + convert_long2_sat(src_data2));
data = mask_data ? data : dst_data;
*((__global int2 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_add_with_mask_C2_D5 (__global float *src1, int src1_step, int src1_offset,
__global float *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
float4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
uchar mask_data = *(mask + mask_index);
float2 src_data1 = *((__global float2 *)((__global char *)src1 + src1_index));
float2 src_data2 = (float2)(src2.x, src2.y);
float2 dst_data = *((__global float2 *)((__global char *)dst + dst_index));
float2 data = src_data1 + src_data2;
data = mask_data ? data : dst_data;
*((__global float2 *)((__global char *)dst + dst_index)) = data;
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_s_add_with_mask_C2_D6 (__global double *src1, int src1_step, int src1_offset,
__global double *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
double4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, (x << 4) + dst_offset);
uchar mask_data = *(mask + mask_index);
double2 src_data1 = *((__global double2 *)((__global char *)src1 + src1_index));
double2 src_data2 = (double2)(src2.x, src2.y);
double2 dst_data = *((__global double2 *)((__global char *)dst + dst_index));
double2 data = src_data1 + src_data2;
data = mask_data ? data : dst_data;
*((__global double2 *)((__global char *)dst + dst_index)) = data;
}
}
#endif
__kernel void arithm_s_add_with_mask_C3_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#undef dst_align
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3));
uchar4 src1_data_0 = vload4(0, src1 + src1_index + 0);
uchar4 src1_data_1 = vload4(0, src1 + src1_index + 4);
uchar4 src1_data_2 = vload4(0, src1 + src1_index + 8);
int4 src2_data_0 = (int4)(src2.x, src2.y, src2.z, src2.x);
int4 src2_data_1 = (int4)(src2.y, src2.z, src2.x, src2.y);
int4 src2_data_2 = (int4)(src2.z, src2.x, src2.y, src2.z);
uchar4 mask_data = vload4(0, mask + mask_index);
uchar4 data_0 = *((__global uchar4 *)(dst + dst_index + 0));
uchar4 data_1 = *((__global uchar4 *)(dst + dst_index + 4));
uchar4 data_2 = *((__global uchar4 *)(dst + dst_index + 8));
uchar4 tmp_data_0 = convert_uchar4_sat(convert_int4_sat(src1_data_0) + src2_data_0);
uchar4 tmp_data_1 = convert_uchar4_sat(convert_int4_sat(src1_data_1) + src2_data_1);
uchar4 tmp_data_2 = convert_uchar4_sat(convert_int4_sat(src1_data_2) + src2_data_2);
data_0.xyz = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_0.w : data_0.w;
data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_1.xy : data_1.xy;
data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.zw : data_1.zw;
data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.x : data_2.x;
data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
? tmp_data_2.yzw : data_2.yzw;
*((__global uchar4 *)(dst + dst_index + 0)) = data_0;
*((__global uchar4 *)(dst + dst_index + 4)) = data_1;
*((__global uchar4 *)(dst + dst_index + 8)) = data_2;
}
}
__kernel void arithm_s_add_with_mask_C3_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 1;
#undef dst_align
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6));
ushort2 src1_data_0 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 0));
ushort2 src1_data_1 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 4));
ushort2 src1_data_2 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 8));
int2 src2_data_0 = (int2)(src2.x, src2.y);
int2 src2_data_1 = (int2)(src2.z, src2.x);
int2 src2_data_2 = (int2)(src2.y, src2.z);
uchar2 mask_data = vload2(0, mask + mask_index);
ushort2 data_0 = *((__global ushort2 *)((__global char *)dst + dst_index + 0));
ushort2 data_1 = *((__global ushort2 *)((__global char *)dst + dst_index + 4));
ushort2 data_2 = *((__global ushort2 *)((__global char *)dst + dst_index + 8));
ushort2 tmp_data_0 = convert_ushort2_sat(convert_int2_sat(src1_data_0) + src2_data_0);
ushort2 tmp_data_1 = convert_ushort2_sat(convert_int2_sat(src1_data_1) + src2_data_1);
ushort2 tmp_data_2 = convert_ushort2_sat(convert_int2_sat(src1_data_2) + src2_data_2);
data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;
*((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
*((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
*((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
}
}
__kernel void arithm_s_add_with_mask_C3_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 1;
#undef dst_align
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6));
short2 src1_data_0 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 0));
short2 src1_data_1 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 4));
short2 src1_data_2 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 8));
int2 src2_data_0 = (int2)(src2.x, src2.y);
int2 src2_data_1 = (int2)(src2.z, src2.x);
int2 src2_data_2 = (int2)(src2.y, src2.z);
uchar2 mask_data = vload2(0, mask + mask_index);
short2 data_0 = *((__global short2 *)((__global char *)dst + dst_index + 0));
short2 data_1 = *((__global short2 *)((__global char *)dst + dst_index + 4));
short2 data_2 = *((__global short2 *)((__global char *)dst + dst_index + 8));
short2 tmp_data_0 = convert_short2_sat(convert_int2_sat(src1_data_0) + src2_data_0);
short2 tmp_data_1 = convert_short2_sat(convert_int2_sat(src1_data_1) + src2_data_1);
short2 tmp_data_2 = convert_short2_sat(convert_int2_sat(src1_data_2) + src2_data_2);
data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;
*((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
*((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
*((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
}
}
__kernel void arithm_s_add_with_mask_C3_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
int src1_data_0 = *((__global int *)((__global char *)src1 + src1_index + 0));
int src1_data_1 = *((__global int *)((__global char *)src1 + src1_index + 4));
int src1_data_2 = *((__global int *)((__global char *)src1 + src1_index + 8));
int src2_data_0 = src2.x;
int src2_data_1 = src2.y;
int src2_data_2 = src2.z;
uchar mask_data = * (mask + mask_index);
int data_0 = *((__global int *)((__global char *)dst + dst_index + 0));
int data_1 = *((__global int *)((__global char *)dst + dst_index + 4));
int data_2 = *((__global int *)((__global char *)dst + dst_index + 8));
int tmp_data_0 = convert_int_sat((long)src1_data_0 + (long)src2_data_0);
int tmp_data_1 = convert_int_sat((long)src1_data_1 + (long)src2_data_1);
int tmp_data_2 = convert_int_sat((long)src1_data_2 + (long)src2_data_2);
data_0 = mask_data ? tmp_data_0 : data_0;
data_1 = mask_data ? tmp_data_1 : data_1;
data_2 = mask_data ? tmp_data_2 : data_2;
*((__global int *)((__global char *)dst + dst_index + 0))= data_0;
*((__global int *)((__global char *)dst + dst_index + 4))= data_1;
*((__global int *)((__global char *)dst + dst_index + 8))= data_2;
}
}
__kernel void arithm_s_add_with_mask_C3_D5 (__global float *src1, int src1_step, int src1_offset,
__global float *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
float4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
float src1_data_0 = *((__global float *)((__global char *)src1 + src1_index + 0));
float src1_data_1 = *((__global float *)((__global char *)src1 + src1_index + 4));
float src1_data_2 = *((__global float *)((__global char *)src1 + src1_index + 8));
float src2_data_0 = src2.x;
float src2_data_1 = src2.y;
float src2_data_2 = src2.z;
uchar mask_data = * (mask + mask_index);
float data_0 = *((__global float *)((__global char *)dst + dst_index + 0));
float data_1 = *((__global float *)((__global char *)dst + dst_index + 4));
float data_2 = *((__global float *)((__global char *)dst + dst_index + 8));
float tmp_data_0 = src1_data_0 + src2_data_0;
float tmp_data_1 = src1_data_1 + src2_data_1;
float tmp_data_2 = src1_data_2 + src2_data_2;
data_0 = mask_data ? tmp_data_0 : data_0;
data_1 = mask_data ? tmp_data_1 : data_1;
data_2 = mask_data ? tmp_data_2 : data_2;
*((__global float *)((__global char *)dst + dst_index + 0))= data_0;
*((__global float *)((__global char *)dst + dst_index + 4))= data_1;
*((__global float *)((__global char *)dst + dst_index + 8))= data_2;
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_s_add_with_mask_C3_D6 (__global double *src1, int src1_step, int src1_offset,
__global double *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
double4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 24));
double src1_data_0 = *((__global double *)((__global char *)src1 + src1_index + 0 ));
double src1_data_1 = *((__global double *)((__global char *)src1 + src1_index + 8 ));
double src1_data_2 = *((__global double *)((__global char *)src1 + src1_index + 16));
double src2_data_0 = src2.x;
double src2_data_1 = src2.y;
double src2_data_2 = src2.z;
uchar mask_data = * (mask + mask_index);
double data_0 = *((__global double *)((__global char *)dst + dst_index + 0 ));
double data_1 = *((__global double *)((__global char *)dst + dst_index + 8 ));
double data_2 = *((__global double *)((__global char *)dst + dst_index + 16));
double tmp_data_0 = src1_data_0 + src2_data_0;
double tmp_data_1 = src1_data_1 + src2_data_1;
double tmp_data_2 = src1_data_2 + src2_data_2;
data_0 = mask_data ? tmp_data_0 : data_0;
data_1 = mask_data ? tmp_data_1 : data_1;
data_2 = mask_data ? tmp_data_2 : data_2;
*((__global double *)((__global char *)dst + dst_index + 0 ))= data_0;
*((__global double *)((__global char *)dst + dst_index + 8 ))= data_1;
*((__global double *)((__global char *)dst + dst_index + 16))= data_2;
}
}
#endif
__kernel void arithm_s_add_with_mask_C4_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
uchar mask_data = *(mask + mask_index);
uchar4 src_data1 = *((__global uchar4 *)(src1 + src1_index));
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 data = convert_uchar4_sat(convert_int4_sat(src_data1) + src2);
data = mask_data ? data : dst_data;
*((__global uchar4 *)(dst + dst_index)) = data;
}
}
__kernel void arithm_s_add_with_mask_C4_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
uchar mask_data = *(mask + mask_index);
ushort4 src_data1 = *((__global ushort4 *)((__global char *)src1 + src1_index));
ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index));
ushort4 data = convert_ushort4_sat(convert_int4_sat(src_data1) + src2);
data = mask_data ? data : dst_data;
*((__global ushort4 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_add_with_mask_C4_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
uchar mask_data = *(mask + mask_index);
short4 src_data1 = *((__global short4 *)((__global char *)src1 + src1_index));
short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
short4 data = convert_short4_sat(convert_int4_sat(src_data1) + src2);
data = mask_data ? data : dst_data;
*((__global short4 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_add_with_mask_C4_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, (x << 4) + dst_offset);
uchar mask_data = *(mask + mask_index);
int4 src_data1 = *((__global int4 *)((__global char *)src1 + src1_index));
int4 dst_data = *((__global int4 *)((__global char *)dst + dst_index));
int4 data = convert_int4_sat(convert_long4_sat(src_data1) + convert_long4_sat(src2));
data = mask_data ? data : dst_data;
*((__global int4 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_add_with_mask_C4_D5 (__global float *src1, int src1_step, int src1_offset,
__global float *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
float4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, (x << 4) + dst_offset);
uchar mask_data = *(mask + mask_index);
float4 src_data1 = *((__global float4 *)((__global char *)src1 + src1_index));
float4 dst_data = *((__global float4 *)((__global char *)dst + dst_index));
float4 data = src_data1 + src2;
data = mask_data ? data : dst_data;
*((__global float4 *)((__global char *)dst + dst_index)) = data;
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_s_add_with_mask_C4_D6 (__global double *src1, int src1_step, int src1_offset,
__global double *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
double4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 5) + src1_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, (x << 5) + dst_offset);
uchar mask_data = *(mask + mask_index);
double4 src_data1 = *((__global double4 *)((__global char *)src1 + src1_index));
double4 dst_data = *((__global double4 *)((__global char *)dst + dst_index));
double4 data = src_data1 + src2;
data = mask_data ? data : dst_data;
*((__global double4 *)((__global char *)dst + dst_index)) = data;
}
}
#endif

View File

@ -0,0 +1,267 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Jiang Liyuan, jlyuan001.good@163.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if defined (DOUBLE_SUPPORT)
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#endif
//////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////BITWISE_AND////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////////
/**************************************bitwise_and without mask**************************************/
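/*
   Vectorization: in the 8- and 16-bit kernels below each work-item processes
   four elements (x is shifted left by 2 after the bounds check), so the host
   side presumably launches with the x-range already divided by the vector
   width. Stores use 4-element vectors rounded down to an aligned address,
   and the dst_start/dst_end comparisons mask off lanes that would fall
   outside the destination ROI row.
*/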
__kernel void arithm_bitwise_and_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#undef dst_align
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
uchar4 src1_data = vload4(0, src1 + src1_index);
uchar4 src2_data = vload4(0, src2 + src2_index);
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = src1_data & src2_data;
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;
*((__global uchar4 *)(dst + dst_index)) = dst_data;
}
}
__kernel void arithm_bitwise_and_D1 (__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#undef dst_align
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
char4 src1_data = vload4(0, src1 + src1_index);
char4 src2_data = vload4(0, src2 + src2_index);
char4 dst_data = *((__global char4 *)(dst + dst_index));
char4 tmp_data = src1_data & src2_data;
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;
*((__global char4 *)(dst + dst_index)) = dst_data;
}
}
__kernel void arithm_bitwise_and_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *src2, int src2_step, int src2_offset,
__global ushort *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#undef dst_align
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8);
ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index));
ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));
ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index));
ushort4 tmp_data = src1_data & src2_data;
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y;
dst_data.z = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.z : dst_data.z;
dst_data.w = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) ? tmp_data.w : dst_data.w;
*((__global ushort4 *)((__global char *)dst + dst_index)) = dst_data;
}
}
__kernel void arithm_bitwise_and_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *src2, int src2_step, int src2_offset,
__global short *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#undef dst_align
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8);
short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index));
short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));
short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
short4 tmp_data = src1_data & src2_data;
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y;
dst_data.z = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.z : dst_data.z;
dst_data.w = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) ? tmp_data.w : dst_data.w;
*((__global short4 *)((__global char *)dst + dst_index)) = dst_data;
}
}
__kernel void arithm_bitwise_and_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *src2, int src2_step, int src2_offset,
__global int *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
int data1 = *((__global int *)((__global char *)src1 + src1_index));
int data2 = *((__global int *)((__global char *)src2 + src2_index));
int tmp = data1 & data2;
*((__global int *)((__global char *)dst + dst_index)) = tmp;
}
}
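/* The remaining depths reinterpret the buffers as raw bytes (char4 for
   32-bit, char8 for 64-bit elements): OpenCL defines the bitwise operators
   only for integer types, and AND-ing the bit pattern gives the byte-wise
   semantics of bitwise_and for any element type. */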
__kernel void arithm_bitwise_and_D5 (__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
char4 data1 = *((__global char4 *)((__global char *)src1 + src1_index));
char4 data2 = *((__global char4 *)((__global char *)src2 + src2_index));
char4 tmp = data1 & data2;
*((__global char4 *)((__global char *)dst + dst_index)) = tmp;
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_bitwise_and_D6 (__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset);
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
char8 data1 = *((__global char8 *)((__global char *)src1 + src1_index));
char8 data2 = *((__global char8 *)((__global char *)src2 + src2_index));
*((__global char8 *)((__global char *)dst + dst_index)) = data1 & data2;
}
}
#endif

File diff suppressed because it is too large

View File

@ -0,0 +1,907 @@
////////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Jiang Liyuan, jlyuan001.good@163.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//
#if defined (DOUBLE_SUPPORT)
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#endif
//////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////BITWISE_AND////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////////
/**************************************and with scalar without mask**************************************/
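/* Naming convention (inferred from the kernel bodies): the Cn suffix is the
 * channel count and the Dn suffix follows OpenCV's depth codes -- D0 = CV_8U
 * (uchar), D1 = CV_8S (char), D2 = CV_16U (ushort), D3 = CV_16S (short),
 * D4 = CV_32S (int), D5 = CV_32F, D6 = CV_64F. Bitwise ops only touch raw
 * bits, so the float and double variants reinterpret the data as char/short
 * vectors of the same byte width. */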
__kernel void arithm_s_bitwise_and_C1_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
uchar4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
uchar4 src1_data = vload4(0, src1 + src1_index);
uchar4 src2_data = (uchar4)(src2.x, src2.x, src2.x, src2.x);
uchar4 data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = src1_data & src2_data;
data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x;
data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : data.y;
data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : data.z;
data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : data.w;
*((__global uchar4 *)(dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_and_C1_D1 (__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
char4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#undef dst_align
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
char4 src1_data = vload4(0, src1 + src1_index);
char4 src2_data = (char4)(src2.x, src2.x, src2.x, src2.x);
char4 data = *((__global char4 *)(dst + dst_index));
char4 tmp_data = src1_data & src2_data;
data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x;
data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : data.y;
data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : data.z;
data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : data.w;
*((__global char4 *)(dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_and_C1_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *dst, int dst_step, int dst_offset,
ushort4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 1;
#undef dst_align
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
ushort2 src1_data = vload2(0, (__global ushort *)((__global char *)src1 + src1_index));
ushort2 src2_data = (ushort2)(src2.x, src2.x);
ushort2 data = *((__global ushort2 *)((__global uchar *)dst + dst_index));
ushort2 tmp_data = src1_data & src2_data;
data.x = (dst_index + 0 >= dst_start) ? tmp_data.x : data.x;
data.y = (dst_index + 2 < dst_end ) ? tmp_data.y : data.y;
*((__global ushort2 *)((__global uchar *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_and_C1_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
short4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 1;
#undef dst_align
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
short2 src1_data = vload2(0, (__global short *)((__global char *)src1 + src1_index));
short2 src2_data = (short2)(src2.x, src2.x);
short2 data = *((__global short2 *)((__global uchar *)dst + dst_index));
short2 tmp_data = src1_data & src2_data;
data.x = (dst_index + 0 >= dst_start) ? tmp_data.x : data.x;
data.y = (dst_index + 2 < dst_end ) ? tmp_data.y : data.y;
*((__global short2 *)((__global uchar *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_and_C1_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
int src_data1 = *((__global int *)((__global char *)src1 + src1_index));
int src_data2 = src2.x;
int data = src_data1 & src_data2;
*((__global int *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_and_C1_D5 (__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
char16 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
char4 src1_data = *((__global char4 *)((__global char *)src1 + src1_index));
char4 src2_data = (char4)(src2.s0, src2.s1, src2.s2, src2.s3);
char4 data = *((__global char4 *)((__global char *)dst + dst_index));
char4 tmp_data = src1_data & src2_data;
data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x;
data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : data.y;
data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : data.z;
data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : data.w;
*((__global char4 *)((__global char *)dst + dst_index)) = data;
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_s_bitwise_and_C1_D6 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
short16 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
short4 src1_data = *((__global short4 *)((__global char *)src1 + src1_index));
short4 src2_data = (short4)(src2.s0, src2.s1, src2.s2, src2.s3);
short4 tmp_data = src1_data & src2_data;
*((__global short4 *)((__global char *)dst + dst_index)) = tmp_data;
}
}
#endif
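/* The scalar operand src2 arrives pre-packed by the host: a plain vector of
 * the element type for D0-D4, and the raw bytes of the float/double scalar
 * for D5/D6 (char16 resp. short16, of which only the leading 4/8 bytes per
 * channel are used -- see the .s0-.s3 selects above). This is an inference
 * from the kernel signatures; the packing itself happens in host code that
 * is not part of this file. */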
__kernel void arithm_s_bitwise_and_C2_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
uchar4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 1;
#undef dst_align
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
uchar4 src1_data = vload4(0, src1 + src1_index);
uchar4 src2_data = (uchar4)(src2.x, src2.y, src2.x, src2.y);
uchar4 data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = src1_data & src2_data;
data.xy = (dst_index + 0 >= dst_start) ? tmp_data.xy : data.xy;
data.zw = (dst_index + 2 < dst_end ) ? tmp_data.zw : data.zw;
*((__global uchar4 *)(dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_and_C2_D1 (__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
char4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 1;
#undef dst_align
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
char4 src1_data = vload4(0, src1 + src1_index);
char4 src2_data = (char4)(src2.x, src2.y, src2.x, src2.y);
char4 data = *((__global char4 *)(dst + dst_index));
char4 tmp_data = src1_data & src2_data;
data.xy = (dst_index + 0 >= dst_start) ? tmp_data.xy : data.xy;
data.zw = (dst_index + 2 < dst_end ) ? tmp_data.zw : data.zw;
*((__global char4 *)(dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_and_C2_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *dst, int dst_step, int dst_offset,
ushort4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
ushort2 src_data1 = *((__global ushort2 *)((__global char *)src1 + src1_index));
ushort2 src_data2 = (ushort2)(src2.x, src2.y);
ushort2 data = src_data1 & src_data2;
*((__global ushort2 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_and_C2_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
short4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
short2 src_data1 = *((__global short2 *)((__global char *)src1 + src1_index));
short2 src_data2 = (short2)(src2.x, src2.y);
short2 data = src_data1 & src_data2;
*((__global short2 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_and_C2_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
int2 src_data1 = *((__global int2 *)((__global char *)src1 + src1_index));
int2 src_data2 = (int2)(src2.x, src2.y);
int2 data = src_data1 & src_data2;
*((__global int2 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_and_C2_D5 (__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
char16 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
char8 src1_data = *((__global char8 *)((__global char *)src1 + src1_index));
char8 src2_data = (char8)(src2.s0, src2.s1, src2.s2, src2.s3, src2.s4, src2.s5, src2.s6, src2.s7);
char8 tmp_data = src1_data & src2_data;
*((__global char8 *)((__global char *)dst + dst_index)) = tmp_data;
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_s_bitwise_and_C2_D6 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
short16 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 4) + dst_offset);
short8 src1_data = *((__global short8 *)((__global char *)src1 + src1_index));
short8 src2_data = (short8)(src2.s0, src2.s1, src2.s2, src2.s3, src2.s4, src2.s5, src2.s6, src2.s7);
short8 tmp_data = src1_data & src2_data;
*((__global short8 *)((__global char *)dst + dst_index)) = tmp_data;
}
}
#endif
__kernel void arithm_s_bitwise_and_C3_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
uchar4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#undef dst_align
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3));
uchar4 src1_data_0 = vload4(0, src1 + src1_index + 0);
uchar4 src1_data_1 = vload4(0, src1 + src1_index + 4);
uchar4 src1_data_2 = vload4(0, src1 + src1_index + 8);
uchar4 src2_data_0 = (uchar4)(src2.x, src2.y, src2.z, src2.x);
uchar4 src2_data_1 = (uchar4)(src2.y, src2.z, src2.x, src2.y);
uchar4 src2_data_2 = (uchar4)(src2.z, src2.x, src2.y, src2.z);
uchar4 data_0 = *((__global uchar4 *)(dst + dst_index + 0));
uchar4 data_1 = *((__global uchar4 *)(dst + dst_index + 4));
uchar4 data_2 = *((__global uchar4 *)(dst + dst_index + 8));
uchar4 tmp_data_0 = src1_data_0 & src2_data_0;
uchar4 tmp_data_1 = src1_data_1 & src2_data_1;
uchar4 tmp_data_2 = src1_data_2 & src2_data_2;
data_0.xyz = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
data_0.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_0.w : data_0.w;
data_1.xy = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_1.xy : data_1.xy;
data_1.zw = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.zw : data_1.zw;
data_2.x = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.x : data_2.x;
data_2.yzw = ((dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
? tmp_data_2.yzw : data_2.yzw;
*((__global uchar4 *)(dst + dst_index + 0)) = data_0;
*((__global uchar4 *)(dst + dst_index + 4)) = data_1;
*((__global uchar4 *)(dst + dst_index + 8)) = data_2;
}
}
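/* The C3 kernels process four 3-channel pixels (12 bytes for D0) per
 * work-item. Because 3 does not divide the 4-wide vectors, the scalar is
 * replicated in three rotated patterns so that every vector lane meets the
 * matching channel: with src2 = (c0, c1, c2, -) the lanes line up as
 *   src2_data_0 = (c0, c1, c2, c0)   covering bytes 0-3
 *   src2_data_1 = (c1, c2, c0, c1)   covering bytes 4-7
 *   src2_data_2 = (c2, c0, c1, c2)   covering bytes 8-11
 * i.e. one channel period of 3 repeated across the 12 elements. */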
__kernel void arithm_s_bitwise_and_C3_D1 (__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
char4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#undef dst_align
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3));
char4 src1_data_0 = vload4(0, src1 + src1_index + 0);
char4 src1_data_1 = vload4(0, src1 + src1_index + 4);
char4 src1_data_2 = vload4(0, src1 + src1_index + 8);
char4 src2_data_0 = (char4)(src2.x, src2.y, src2.z, src2.x);
char4 src2_data_1 = (char4)(src2.y, src2.z, src2.x, src2.y);
char4 src2_data_2 = (char4)(src2.z, src2.x, src2.y, src2.z);
char4 data_0 = *((__global char4 *)(dst + dst_index + 0));
char4 data_1 = *((__global char4 *)(dst + dst_index + 4));
char4 data_2 = *((__global char4 *)(dst + dst_index + 8));
char4 tmp_data_0 = src1_data_0 & src2_data_0;
char4 tmp_data_1 = src1_data_1 & src2_data_1;
char4 tmp_data_2 = src1_data_2 & src2_data_2;
data_0.xyz = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
data_0.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_0.w : data_0.w;
data_1.xy = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_1.xy : data_1.xy;
data_1.zw = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.zw : data_1.zw;
data_2.x = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.x : data_2.x;
data_2.yzw = ((dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
? tmp_data_2.yzw : data_2.yzw;
*((__global char4 *)(dst + dst_index + 0)) = data_0;
*((__global char4 *)(dst + dst_index + 4)) = data_1;
*((__global char4 *)(dst + dst_index + 8)) = data_2;
}
}
__kernel void arithm_s_bitwise_and_C3_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *dst, int dst_step, int dst_offset,
ushort4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 1;
#undef dst_align
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6));
ushort2 src1_data_0 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 0));
ushort2 src1_data_1 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 4));
ushort2 src1_data_2 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 8));
ushort2 src2_data_0 = (ushort2)(src2.x, src2.y);
ushort2 src2_data_1 = (ushort2)(src2.z, src2.x);
ushort2 src2_data_2 = (ushort2)(src2.y, src2.z);
ushort2 data_0 = *((__global ushort2 *)((__global char *)dst + dst_index + 0));
ushort2 data_1 = *((__global ushort2 *)((__global char *)dst + dst_index + 4));
ushort2 data_2 = *((__global ushort2 *)((__global char *)dst + dst_index + 8));
ushort2 tmp_data_0 = src1_data_0 & src2_data_0;
ushort2 tmp_data_1 = src1_data_1 & src2_data_1;
ushort2 tmp_data_2 = src1_data_2 & src2_data_2;
data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;
data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;
*((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
*((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
*((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
}
}
__kernel void arithm_s_bitwise_and_C3_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
short4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 1;
#undef dst_align
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6));
short2 src1_data_0 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 0));
short2 src1_data_1 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 4));
short2 src1_data_2 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 8));
short2 src2_data_0 = (short2)(src2.x, src2.y);
short2 src2_data_1 = (short2)(src2.z, src2.x);
short2 src2_data_2 = (short2)(src2.y, src2.z);
short2 data_0 = *((__global short2 *)((__global char *)dst + dst_index + 0));
short2 data_1 = *((__global short2 *)((__global char *)dst + dst_index + 4));
short2 data_2 = *((__global short2 *)((__global char *)dst + dst_index + 8));
short2 tmp_data_0 = src1_data_0 & src2_data_0;
short2 tmp_data_1 = src1_data_1 & src2_data_1;
short2 tmp_data_2 = src1_data_2 & src2_data_2;
data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;
data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;
*((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
*((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
*((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
}
}
__kernel void arithm_s_bitwise_and_C3_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
int src1_data_0 = *((__global int *)((__global char *)src1 + src1_index + 0));
int src1_data_1 = *((__global int *)((__global char *)src1 + src1_index + 4));
int src1_data_2 = *((__global int *)((__global char *)src1 + src1_index + 8));
int src2_data_0 = src2.x;
int src2_data_1 = src2.y;
int src2_data_2 = src2.z;
int data_0 = *((__global int *)((__global char *)dst + dst_index + 0));
int data_1 = *((__global int *)((__global char *)dst + dst_index + 4));
int data_2 = *((__global int *)((__global char *)dst + dst_index + 8));
int tmp_data_0 = src1_data_0 & src2_data_0;
int tmp_data_1 = src1_data_1 & src2_data_1;
int tmp_data_2 = src1_data_2 & src2_data_2;
*((__global int *)((__global char *)dst + dst_index + 0))= tmp_data_0;
*((__global int *)((__global char *)dst + dst_index + 4))= tmp_data_1;
*((__global int *)((__global char *)dst + dst_index + 8))= tmp_data_2;
}
}
__kernel void arithm_s_bitwise_and_C3_D5 (__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
char16 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
char4 src1_data_0 = *((__global char4 *)((__global char *)src1 + src1_index + 0));
char4 src1_data_1 = *((__global char4 *)((__global char *)src1 + src1_index + 4));
char4 src1_data_2 = *((__global char4 *)((__global char *)src1 + src1_index + 8));
char4 src2_data_0 = (char4)(src2.s0, src2.s1, src2.s2, src2.s3);
char4 src2_data_1 = (char4)(src2.s4, src2.s5, src2.s6, src2.s7);
char4 src2_data_2 = (char4)(src2.s8, src2.s9, src2.sa, src2.sb);
char4 data_0 = *((__global char4 *)((__global char *)dst + dst_index + 0));
char4 data_1 = *((__global char4 *)((__global char *)dst + dst_index + 4));
char4 data_2 = *((__global char4 *)((__global char *)dst + dst_index + 8));
char4 tmp_data_0 = src1_data_0 & src2_data_0;
char4 tmp_data_1 = src1_data_1 & src2_data_1;
char4 tmp_data_2 = src1_data_2 & src2_data_2;
*((__global char4 *)((__global char *)dst + dst_index + 0))= tmp_data_0;
*((__global char4 *)((__global char *)dst + dst_index + 4))= tmp_data_1;
*((__global char4 *)((__global char *)dst + dst_index + 8))= tmp_data_2;
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_s_bitwise_and_C3_D6 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
short16 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 24));
short4 src1_data_0 = *((__global short4 *)((__global char *)src1 + src1_index + 0 ));
short4 src1_data_1 = *((__global short4 *)((__global char *)src1 + src1_index + 8 ));
short4 src1_data_2 = *((__global short4 *)((__global char *)src1 + src1_index + 16));
short4 src2_data_0 = (short4)(src2.s0, src2.s1, src2.s2, src2.s3);
short4 src2_data_1 = (short4)(src2.s4, src2.s5, src2.s6, src2.s7);
short4 src2_data_2 = (short4)(src2.s8, src2.s9, src2.sa, src2.sb);
short4 data_0 = *((__global short4 *)((__global char *)dst + dst_index + 0 ));
short4 data_1 = *((__global short4 *)((__global char *)dst + dst_index + 8 ));
short4 data_2 = *((__global short4 *)((__global char *)dst + dst_index + 16));
short4 tmp_data_0 = src1_data_0 & src2_data_0;
short4 tmp_data_1 = src1_data_1 & src2_data_1;
short4 tmp_data_2 = src1_data_2 & src2_data_2;
*((__global short4 *)((__global char *)dst + dst_index + 0 ))= tmp_data_0;
*((__global short4 *)((__global char *)dst + dst_index + 8 ))= tmp_data_1;
*((__global short4 *)((__global char *)dst + dst_index + 16))= tmp_data_2;
}
}
#endif
__kernel void arithm_s_bitwise_and_C4_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
uchar4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
uchar4 src_data1 = *((__global uchar4 *)(src1 + src1_index));
uchar4 data = src_data1 & src2;
*((__global uchar4 *)(dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_and_C4_D1 (__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
char4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
char4 src_data1 = *((__global char4 *)(src1 + src1_index));
char4 data = src_data1 & src2;
*((__global char4 *)(dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_and_C4_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *dst, int dst_step, int dst_offset,
ushort4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
ushort4 src_data1 = *((__global ushort4 *)((__global char *)src1 + src1_index));
ushort4 data = src_data1 & src2;
*((__global ushort4 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_and_C4_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
short4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
short4 src_data1 = *((__global short4 *)((__global char *)src1 + src1_index));
short4 data = src_data1 & src2;
*((__global short4 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_and_C4_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 4) + dst_offset);
int4 src_data1 = *((__global int4 *)((__global char *)src1 + src1_index));
int4 data = src_data1 & src2;
*((__global int4 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_and_C4_D5 (__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
char16 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 4) + dst_offset);
char16 src1_data = *((__global char16 *)((__global char *)src1 + src1_index));
char16 src2_data = (char16)(src2.s0, src2.s1, src2.s2, src2.s3, src2.s4, src2.s5, src2.s6, src2.s7,
src2.s8, src2.s9, src2.sa, src2.sb, src2.sc, src2.sd, src2.se, src2.sf);
char16 tmp_data = src1_data & src2_data;
*((__global char16 *)((__global char *)dst + dst_index)) = tmp_data;
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_s_bitwise_and_C4_D6 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
short16 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 5) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 5) + dst_offset);
short4 src1_data_0 = *((__global short4 *)((__global char *)src1 + src1_index + 0));
short4 src1_data_1 = *((__global short4 *)((__global char *)src1 + src1_index + 8));
short4 src1_data_2 = *((__global short4 *)((__global char *)src1 + src1_index + 16));
short4 src1_data_3 = *((__global short4 *)((__global char *)src1 + src1_index + 24));
short4 src2_data_0 = (short4)(src2.s0, src2.s1, src2.s2, src2.s3);
short4 src2_data_1 = (short4)(src2.s4, src2.s5, src2.s6, src2.s7);
short4 src2_data_2 = (short4)(src2.s8, src2.s9, src2.sa, src2.sb);
short4 src2_data_3 = (short4)(src2.sc, src2.sd, src2.se, src2.sf);
short4 tmp_data_0 = src1_data_0 & src2_data_0;
short4 tmp_data_1 = src1_data_1 & src2_data_1;
short4 tmp_data_2 = src1_data_2 & src2_data_2;
short4 tmp_data_3 = src1_data_3 & src2_data_3;
*((__global short4 *)((__global char *)dst + dst_index + 0 ))= tmp_data_0;
*((__global short4 *)((__global char *)dst + dst_index + 8 ))= tmp_data_1;
*((__global short4 *)((__global char *)dst + dst_index + 16))= tmp_data_2;
*((__global short4 *)((__global char *)dst + dst_index + 24))= tmp_data_3;
}
}
#endif

File diff suppressed because it is too large

View File

@ -0,0 +1,251 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Jiang Liyuan, jlyuan001.good@163.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if defined (DOUBLE_SUPPORT)
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#endif
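/* DOUBLE_SUPPORT is presumably defined by the host as a build option
 * (e.g. -D DOUBLE_SUPPORT) when the device exposes cl_khr_fp64; only then
 * are the D6 (64-bit) kernels below compiled. */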
//////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////BITWISE_NOT////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////////
/**************************************bitwise_not without mask**************************************/
__kernel void arithm_bitwise_not_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
uchar4 src1_data = vload4(0, src1 + src1_index);
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = ~ src1_data;
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;
*((__global uchar4 *)(dst + dst_index)) = dst_data;
}
}
__kernel void arithm_bitwise_not_D1 (__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#undef dst_align
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
char4 src1_data = vload4(0, src1 + src1_index);
char4 dst_data = *((__global char4 *)(dst + dst_index));
char4 tmp_data = ~ src1_data;
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;
*((__global char4 *)(dst + dst_index)) = dst_data;
}
}
__kernel void arithm_bitwise_not_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#undef dst_align
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8);
ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index));
ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index));
ushort4 tmp_data = ~ src1_data;
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y;
dst_data.z = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.z : dst_data.z;
dst_data.w = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) ? tmp_data.w : dst_data.w;
*((__global ushort4 *)((__global char *)dst + dst_index)) = dst_data;
}
}
__kernel void arithm_bitwise_not_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#undef dst_align
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8);
short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index));
short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
short4 tmp_data = ~ src1_data;
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y;
dst_data.z = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.z : dst_data.z;
dst_data.w = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) ? tmp_data.w : dst_data.w;
*((__global short4 *)((__global char *)dst + dst_index)) = dst_data;
}
}
__kernel void arithm_bitwise_not_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
int data1 = *((__global int *)((__global char *)src1 + src1_index));
int tmp = ~ data1;
*((__global int *)((__global char *)dst + dst_index)) = tmp;
}
}
__kernel void arithm_bitwise_not_D5 (__global char *src, int src_step, int src_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src_index = mad24(y, src_step, (x << 2) + src_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
char4 data;
data = *((__global char4 *)((__global char *)src + src_index));
data = ~ data;
*((__global char4 *)((__global char *)dst + dst_index)) = data;
}
}
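/* For the 32- and 64-bit depths, bitwise NOT is applied to the raw bit
 * pattern through char4/char8 vectors; this is well defined for bitwise
 * operations and means the D5 kernel needs no floating-point support. */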
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_bitwise_not_D6 (__global char *src, int src_step, int src_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src_index = mad24(y, src_step, (x << 3) + src_offset);
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
char8 data;
data = *((__global char8 *)((__global char *)src + src_index));
data = ~ data;
*((__global char8 *)((__global char *)dst + dst_index)) = data;
}
}
#endif

View File

@ -0,0 +1,267 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Jiang Liyuan, jlyuan001.good@163.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if defined (DOUBLE_SUPPORT)
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#endif
//////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////BITWISE_OR////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////////
/**************************************bitwise_or without mask**************************************/
__kernel void arithm_bitwise_or_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
uchar4 src1_data = vload4(0, src1 + src1_index);
uchar4 src2_data = vload4(0, src2 + src2_index);
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = src1_data | src2_data;
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;
*((__global uchar4 *)(dst + dst_index)) = dst_data;
}
}
__kernel void arithm_bitwise_or_D1 (__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#undef dst_align
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
char4 src1_data = vload4(0, src1 + src1_index);
char4 src2_data = vload4(0, src2 + src2_index);
char4 dst_data = *((__global char4 *)(dst + dst_index));
char4 tmp_data = src1_data | src2_data;
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;
*((__global char4 *)(dst + dst_index)) = dst_data;
}
}
__kernel void arithm_bitwise_or_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *src2, int src2_step, int src2_offset,
__global ushort *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#undef dst_align
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8);
ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index));
ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));
ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index));
ushort4 tmp_data = src1_data | src2_data;
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y;
dst_data.z = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.z : dst_data.z;
dst_data.w = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) ? tmp_data.w : dst_data.w;
*((__global ushort4 *)((__global char *)dst + dst_index)) = dst_data;
}
}
__kernel void arithm_bitwise_or_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *src2, int src2_step, int src2_offset,
__global short *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#undef dst_align
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8);
short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index));
short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));
short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
short4 tmp_data = src1_data | src2_data;
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y;
dst_data.z = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.z : dst_data.z;
dst_data.w = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) ? tmp_data.w : dst_data.w;
*((__global short4 *)((__global char *)dst + dst_index)) = dst_data;
}
}
__kernel void arithm_bitwise_or_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *src2, int src2_step, int src2_offset,
__global int *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
int data1 = *((__global int *)((__global char *)src1 + src1_index));
int data2 = *((__global int *)((__global char *)src2 + src2_index));
int tmp = data1 | data2;
*((__global int *)((__global char *)dst + dst_index)) = tmp;
}
}
__kernel void arithm_bitwise_or_D5 (__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
char4 data1 = *((__global char4 *)((__global char *)src1 + src1_index));
char4 data2 = *((__global char4 *)((__global char *)src2 + src2_index));
char4 tmp = data1 | data2;
*((__global char4 *)((__global char *)dst + dst_index)) = tmp;
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_bitwise_or_D6 (__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset);
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
char8 data1 = *((__global char8 *)((__global char *)src1 + src1_index));
char8 data2 = *((__global char8 *)((__global char *)src2 + src2_index));
*((__global char8 *)((__global char *)dst + dst_index)) = data1 | data2;
}
}
#endif

File diff suppressed because it is too large

View File

@ -0,0 +1,914 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Jiang Liyuan, jlyuan001.good@163.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if defined (DOUBLE_SUPPORT)
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#endif
//////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////BITWISE_OR////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////////
/**************************************or with scalar without mask**************************************/
__kernel void arithm_s_bitwise_or_C1_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
uchar4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
uchar4 src1_data = vload4(0, src1 + src1_index);
uchar4 src2_data = (uchar4)(src2.x, src2.x, src2.x, src2.x);
uchar4 data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = src1_data | src2_data;
data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x;
data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : data.y;
data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : data.z;
data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : data.w;
*((__global uchar4 *)(dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_or_C1_D1 (__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
char4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#undef dst_align
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
char4 src1_data = vload4(0, src1 + src1_index);
char4 src2_data = (char4)(src2.x, src2.x, src2.x, src2.x);
char4 data = *((__global char4 *)(dst + dst_index));
char4 tmp_data = src1_data | src2_data;
data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x;
data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : data.y;
data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : data.z;
data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : data.w;
*((__global char4 *)(dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_or_C1_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *dst, int dst_step, int dst_offset,
ushort4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 1;
#undef dst_align
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
ushort2 src1_data = vload2(0, (__global ushort *)((__global char *)src1 + src1_index));
ushort2 src2_data = (ushort2)(src2.x, src2.x);
ushort2 data = *((__global ushort2 *)((__global uchar *)dst + dst_index));
ushort2 tmp_data = src1_data | src2_data;
data.x = (dst_index + 0 >= dst_start) ? tmp_data.x : data.x;
data.y = (dst_index + 2 < dst_end ) ? tmp_data.y : data.y;
*((__global ushort2 *)((__global uchar *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_or_C1_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
short4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 1;
#undef dst_align
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
short2 src1_data = vload2(0, (__global short *)((__global char *)src1 + src1_index));
short2 src2_data = (short2)(src2.x, src2.x);
short2 data = *((__global short2 *)((__global uchar *)dst + dst_index));
short2 tmp_data = src1_data | src2_data;
data.x = (dst_index + 0 >= dst_start) ? tmp_data.x : data.x;
data.y = (dst_index + 2 < dst_end ) ? tmp_data.y : data.y;
*((__global short2 *)((__global uchar *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_or_C1_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
int src_data1 = *((__global int *)((__global char *)src1 + src1_index));
int src_data2 = src2.x;
int data = src_data1 | src_data2;
*((__global int *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_or_C1_D5 (__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
char16 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
char4 src_data1 = *((__global char4 *)((__global char *)src1 + src1_index));
char4 src_data2 = (char4)(src2.s0, src2.s1, src2.s2, src2.s3);
char4 data = src_data1 | src_data2;
*((__global char4 *)((__global char *)dst + dst_index)) = data;
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_s_bitwise_or_C1_D6 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
short16 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
short4 src1_data = *((__global short4 *)((__global char *)src1 + src1_index));
short4 src2_data = (short4)(src2.s0, src2.s1, src2.s2, src2.s3);
short4 tmp_data = src1_data | src2_data;
*((__global short4 *)((__global char *)dst + dst_index)) = tmp_data;
}
}
#endif
__kernel void arithm_s_bitwise_or_C2_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
uchar4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 1;
#undef dst_align
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
uchar4 src1_data = vload4(0, src1 + src1_index);
uchar4 src2_data = (uchar4)(src2.x, src2.y, src2.x, src2.y);
uchar4 data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = src1_data | src2_data;
data.xy = (dst_index + 0 >= dst_start) ? tmp_data.xy : data.xy;
data.zw = (dst_index + 2 < dst_end ) ? tmp_data.zw : data.zw;
*((__global uchar4 *)(dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_or_C2_D1 (__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
char4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 1;
#undef dst_align
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
char4 src1_data = vload4(0, src1 + src1_index);
char4 src2_data = (char4)(src2.x, src2.y, src2.x, src2.y);
char4 data = *((__global char4 *)(dst + dst_index));
char4 tmp_data = src1_data | src2_data;
data.xy = (dst_index + 0 >= dst_start) ? tmp_data.xy : data.xy;
data.zw = (dst_index + 2 < dst_end ) ? tmp_data.zw : data.zw;
*((__global char4 *)(dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_or_C2_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *dst, int dst_step, int dst_offset,
ushort4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
ushort2 src_data1 = *((__global ushort2 *)((__global char *)src1 + src1_index));
ushort2 src_data2 = (ushort2)(src2.x, src2.y);
ushort2 data = src_data1 | src_data2;
*((__global ushort2 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_or_C2_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
short4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
short2 src_data1 = *((__global short2 *)((__global char *)src1 + src1_index));
short2 src_data2 = (short2)(src2.x, src2.y);
short2 data = src_data1 | src_data2;
*((__global short2 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_or_C2_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
int2 src_data1 = *((__global int2 *)((__global char *)src1 + src1_index));
int2 src_data2 = (int2)(src2.x, src2.y);
int2 data = src_data1 | src_data2;
*((__global int2 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_or_C2_D5 (__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
char16 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
char8 src_data1 = *((__global char8 *)((__global char *)src1 + src1_index));
char8 src_data2 = (char8)(src2.s0, src2.s1, src2.s2, src2.s3, src2.s4, src2.s5, src2.s6, src2.s7);
char8 data = src_data1 | src_data2;
*((__global char8 *)((__global char *)dst + dst_index)) = data;
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_s_bitwise_or_C2_D6 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
short16 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 4) + dst_offset);
short8 src1_data = *((__global short8 *)((__global char *)src1 + src1_index));
short8 src2_data = (short8)(src2.s0, src2.s1, src2.s2, src2.s3, src2.s4, src2.s5, src2.s6, src2.s7);
short8 tmp_data = src1_data | src2_data;
*((__global short8 *)((__global char *)dst + dst_index)) = tmp_data;
}
}
#endif
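// 3-channel variants: each work-item covers 12 consecutive bytes (four
// interleaved 3-channel pixels for D0/D1), loaded as three 4-lane vectors.
// The scalar is rotated across lanes -- (x,y,z,x), (y,z,x,y), (z,x,y,z) --
// so every lane is OR'ed with the value of its own channel.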
__kernel void arithm_s_bitwise_or_C3_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
uchar4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#undef dst_align
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3));
uchar4 src1_data_0 = vload4(0, src1 + src1_index + 0);
uchar4 src1_data_1 = vload4(0, src1 + src1_index + 4);
uchar4 src1_data_2 = vload4(0, src1 + src1_index + 8);
uchar4 src2_data_0 = (uchar4)(src2.x, src2.y, src2.z, src2.x);
uchar4 src2_data_1 = (uchar4)(src2.y, src2.z, src2.x, src2.y);
uchar4 src2_data_2 = (uchar4)(src2.z, src2.x, src2.y, src2.z);
uchar4 data_0 = *((__global uchar4 *)(dst + dst_index + 0));
uchar4 data_1 = *((__global uchar4 *)(dst + dst_index + 4));
uchar4 data_2 = *((__global uchar4 *)(dst + dst_index + 8));
uchar4 tmp_data_0 = src1_data_0 | src2_data_0 ;
uchar4 tmp_data_1 = src1_data_1 | src2_data_1 ;
uchar4 tmp_data_2 = src1_data_2 | src2_data_2 ;
data_0.xyz = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
data_0.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_0.w : data_0.w;
data_1.xy = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_1.xy : data_1.xy;
data_1.zw = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.zw : data_1.zw;
data_2.x = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.x : data_2.x;
data_2.yzw = ((dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
? tmp_data_2.yzw : data_2.yzw;
*((__global uchar4 *)(dst + dst_index + 0)) = data_0;
*((__global uchar4 *)(dst + dst_index + 4)) = data_1;
*((__global uchar4 *)(dst + dst_index + 8)) = data_2;
}
}
__kernel void arithm_s_bitwise_or_C3_D1 (__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
char4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#undef dst_align
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3));
char4 src1_data_0 = vload4(0, src1 + src1_index + 0);
char4 src1_data_1 = vload4(0, src1 + src1_index + 4);
char4 src1_data_2 = vload4(0, src1 + src1_index + 8);
char4 src2_data_0 = (char4)(src2.x, src2.y, src2.z, src2.x);
char4 src2_data_1 = (char4)(src2.y, src2.z, src2.x, src2.y);
char4 src2_data_2 = (char4)(src2.z, src2.x, src2.y, src2.z);
char4 data_0 = *((__global char4 *)(dst + dst_index + 0));
char4 data_1 = *((__global char4 *)(dst + dst_index + 4));
char4 data_2 = *((__global char4 *)(dst + dst_index + 8));
char4 tmp_data_0 = src1_data_0 | src2_data_0;
char4 tmp_data_1 = src1_data_1 | src2_data_1;
char4 tmp_data_2 = src1_data_2 | src2_data_2;
data_0.xyz = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
data_0.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_0.w : data_0.w;
data_1.xy = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_1.xy : data_1.xy;
data_1.zw = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.zw : data_1.zw;
data_2.x = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.x : data_2.x;
data_2.yzw = ((dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
? tmp_data_2.yzw : data_2.yzw;
*((__global char4 *)(dst + dst_index + 0)) = data_0;
*((__global char4 *)(dst + dst_index + 4)) = data_1;
*((__global char4 *)(dst + dst_index + 8)) = data_2;
}
}
__kernel void arithm_s_bitwise_or_C3_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *dst, int dst_step, int dst_offset,
ushort4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 1;
#undef dst_align
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6));
ushort2 src1_data_0 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 0));
ushort2 src1_data_1 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 4));
ushort2 src1_data_2 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 8));
ushort2 src2_data_0 = (ushort2)(src2.x, src2.y);
ushort2 src2_data_1 = (ushort2)(src2.z, src2.x);
ushort2 src2_data_2 = (ushort2)(src2.y, src2.z);
ushort2 data_0 = *((__global ushort2 *)((__global char *)dst + dst_index + 0));
ushort2 data_1 = *((__global ushort2 *)((__global char *)dst + dst_index + 4));
ushort2 data_2 = *((__global ushort2 *)((__global char *)dst + dst_index + 8));
ushort2 tmp_data_0 = src1_data_0 | src2_data_0 ;
ushort2 tmp_data_1 = src1_data_1 | src2_data_1 ;
ushort2 tmp_data_2 = src1_data_2 | src2_data_2 ;
data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;
data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;
*((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
*((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
*((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
}
}
__kernel void arithm_s_bitwise_or_C3_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
short4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 1;
#undef dst_align
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6));
short2 src1_data_0 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 0));
short2 src1_data_1 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 4));
short2 src1_data_2 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 8));
short2 src2_data_0 = (short2)(src2.x, src2.y);
short2 src2_data_1 = (short2)(src2.z, src2.x);
short2 src2_data_2 = (short2)(src2.y, src2.z);
short2 data_0 = *((__global short2 *)((__global char *)dst + dst_index + 0));
short2 data_1 = *((__global short2 *)((__global char *)dst + dst_index + 4));
short2 data_2 = *((__global short2 *)((__global char *)dst + dst_index + 8));
short2 tmp_data_0 = src1_data_0 | src2_data_0 ;
short2 tmp_data_1 = src1_data_1 | src2_data_1 ;
short2 tmp_data_2 = src1_data_2 | src2_data_2 ;
data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;
data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;
*((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
*((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
*((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
}
}
__kernel void arithm_s_bitwise_or_C3_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
int src1_data_0 = *((__global int *)((__global char *)src1 + src1_index + 0));
int src1_data_1 = *((__global int *)((__global char *)src1 + src1_index + 4));
int src1_data_2 = *((__global int *)((__global char *)src1 + src1_index + 8));
int src2_data_0 = src2.x;
int src2_data_1 = src2.y;
int src2_data_2 = src2.z;
int data_0 = *((__global int *)((__global char *)dst + dst_index + 0));
int data_1 = *((__global int *)((__global char *)dst + dst_index + 4));
int data_2 = *((__global int *)((__global char *)dst + dst_index + 8));
int tmp_data_0 = src1_data_0 | src2_data_0;
int tmp_data_1 = src1_data_1 | src2_data_1;
int tmp_data_2 = src1_data_2 | src2_data_2;
*((__global int *)((__global char *)dst + dst_index + 0))= tmp_data_0;
*((__global int *)((__global char *)dst + dst_index + 4))= tmp_data_1;
*((__global int *)((__global char *)dst + dst_index + 8))= tmp_data_2;
}
}
__kernel void arithm_s_bitwise_or_C3_D5 (__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
char16 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
char4 src1_data_0 = *((__global char4 *)((__global char *)src1 + src1_index + 0));
char4 src1_data_1 = *((__global char4 *)((__global char *)src1 + src1_index + 4));
char4 src1_data_2 = *((__global char4 *)((__global char *)src1 + src1_index + 8));
char4 src2_data_0 = (char4)(src2.s0, src2.s1, src2.s2, src2.s3);
char4 src2_data_1 = (char4)(src2.s4, src2.s5, src2.s6, src2.s7);
char4 src2_data_2 = (char4)(src2.s8, src2.s9, src2.sA, src2.sB);
char4 tmp_data_0 = src1_data_0 | src2_data_0;
char4 tmp_data_1 = src1_data_1 | src2_data_1;
char4 tmp_data_2 = src1_data_2 | src2_data_2;
*((__global char4 *)((__global char *)dst + dst_index + 0))= tmp_data_0;
*((__global char4 *)((__global char *)dst + dst_index + 4))= tmp_data_1;
*((__global char4 *)((__global char *)dst + dst_index + 8))= tmp_data_2;
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_s_bitwise_or_C3_D6 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
short16 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 24));
short4 src1_data_0 = *((__global short4 *)((__global char *)src1 + src1_index + 0 ));
short4 src1_data_1 = *((__global short4 *)((__global char *)src1 + src1_index + 8 ));
short4 src1_data_2 = *((__global short4 *)((__global char *)src1 + src1_index + 16));
short4 src2_data_0 = (short4)(src2.s0, src2.s1, src2.s2, src2.s3);
short4 src2_data_1 = (short4)(src2.s4, src2.s5, src2.s6, src2.s7);
short4 src2_data_2 = (short4)(src2.s8, src2.s9, src2.sa, src2.sb);
short4 data_0 = *((__global short4 *)((__global char *)dst + dst_index + 0 ));
short4 data_1 = *((__global short4 *)((__global char *)dst + dst_index + 8 ));
short4 data_2 = *((__global short4 *)((__global char *)dst + dst_index + 16));
short4 tmp_data_0 = src1_data_0 | src2_data_0;
short4 tmp_data_1 = src1_data_1 | src2_data_1;
short4 tmp_data_2 = src1_data_2 | src2_data_2;
*((__global short4 *)((__global char *)dst + dst_index + 0 ))= tmp_data_0;
*((__global short4 *)((__global char *)dst + dst_index + 8 ))= tmp_data_1;
*((__global short4 *)((__global char *)dst + dst_index + 16))= tmp_data_2;
}
}
#endif
__kernel void arithm_s_bitwise_or_C4_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
uchar4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
uchar4 src_data1 = *((__global uchar4 *)(src1 + src1_index));
uchar4 data = src_data1 | src2;
*((__global uchar4 *)(dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_or_C4_D1 (__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
char4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
char4 src_data1 = *((__global char4 *)(src1 + src1_index));
char4 data = src_data1 | src2;
*((__global char4 *)(dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_or_C4_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *dst, int dst_step, int dst_offset,
ushort4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
ushort4 src_data1 = *((__global ushort4 *)((__global char *)src1 + src1_index));
ushort4 data = src_data1 | src2;
*((__global ushort4 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_or_C4_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
short4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
short4 src_data1 = *((__global short4 *)((__global char *)src1 + src1_index));
short4 data = src_data1 | src2;
*((__global short4 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_or_C4_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 4) + dst_offset);
int4 src_data1 = *((__global int4 *)((__global char *)src1 + src1_index));
int4 data = src_data1 | src2;
*((__global int4 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_or_C4_D5 (__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
char16 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 4) + dst_offset);
char16 src_data1 = *((__global char16 *)((__global char *)src1 + src1_index));
char16 src_data2 = (char16)(src2.s0, src2.s1, src2.s2, src2.s3, src2.s4, src2.s5, src2.s6, src2.s7,
src2.s8, src2.s9, src2.sa, src2.sb, src2.sc, src2.sd, src2.se, src2.sf);
char16 data = src_data1 | src_data2;
*((__global char16 *)((__global char *)dst + dst_index)) = data;
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_s_bitwise_or_C4_D6 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
short16 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 5) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 5) + dst_offset);
short4 src1_data_0 = *((__global short4 *)((__global char *)src1 + src1_index + 0));
short4 src1_data_1 = *((__global short4 *)((__global char *)src1 + src1_index + 8));
short4 src1_data_2 = *((__global short4 *)((__global char *)src1 + src1_index + 16));
short4 src1_data_3 = *((__global short4 *)((__global char *)src1 + src1_index + 24));
short4 src2_data_0 = (short4)(src2.s0, src2.s1, src2.s2, src2.s3);
short4 src2_data_1 = (short4)(src2.s4, src2.s5, src2.s6, src2.s7);
short4 src2_data_2 = (short4)(src2.s8, src2.s9, src2.sa, src2.sb);
short4 src2_data_3 = (short4)(src2.sc, src2.sd, src2.se, src2.sf);
short4 tmp_data_0 = src1_data_0 | src2_data_0;
short4 tmp_data_1 = src1_data_1 | src2_data_1;
short4 tmp_data_2 = src1_data_2 | src2_data_2;
short4 tmp_data_3 = src1_data_3 | src2_data_3;
*((__global short4 *)((__global char *)dst + dst_index + 0 ))= tmp_data_0;
*((__global short4 *)((__global char *)dst + dst_index + 8 ))= tmp_data_1;
*((__global short4 *)((__global char *)dst + dst_index + 16))= tmp_data_2;
*((__global short4 *)((__global char *)dst + dst_index + 24))= tmp_data_3;
}
}
#endif
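Host-side setup is not part of this file, but for orientation here is a
minimal, hypothetical sketch of launching arithm_s_bitwise_or_C1_D0 through
the raw OpenCL C API. The helper name and error handling are assumptions,
not code from this commit; per the kernel's guard, cols must already count
4-pixel groups, and all step/offset arguments are in bytes.

#include <CL/cl.h>   // <OpenCL/opencl.h> on Mac

// Hypothetical helper: enqueue arithm_s_bitwise_or_C1_D0 on a prebuilt program.
static int launch_bitwise_or_scalar_8u(cl_command_queue queue, cl_program program,
                                       cl_mem src1, int src1_step, int src1_offset,
                                       cl_mem dst, int dst_step, int dst_offset,
                                       cl_uchar4 scalar, int rows, int cols,
                                       int dst_step1)
{
    cl_int err;
    cl_kernel k = clCreateKernel(program, "arithm_s_bitwise_or_C1_D0", &err);
    if (err != CL_SUCCESS) return err;

    int a = 0;                         // arguments follow the kernel signature
    clSetKernelArg(k, a++, sizeof(cl_mem),    &src1);
    clSetKernelArg(k, a++, sizeof(int),       &src1_step);
    clSetKernelArg(k, a++, sizeof(int),       &src1_offset);
    clSetKernelArg(k, a++, sizeof(cl_mem),    &dst);
    clSetKernelArg(k, a++, sizeof(int),       &dst_step);
    clSetKernelArg(k, a++, sizeof(int),       &dst_offset);
    clSetKernelArg(k, a++, sizeof(cl_uchar4), &scalar);
    clSetKernelArg(k, a++, sizeof(int),       &rows);
    clSetKernelArg(k, a++, sizeof(int),       &cols);
    clSetKernelArg(k, a++, sizeof(int),       &dst_step1);

    // One work-item per 4-pixel group along x, one per row along y.
    size_t global[2] = { (size_t)cols, (size_t)rows };
    err = clEnqueueNDRangeKernel(queue, k, 2, NULL, global, NULL, 0, NULL, NULL);
    clReleaseKernel(k);
    return err;
}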

File diff suppressed because it is too large

View File

@ -0,0 +1,267 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Jiang Liyuan, jlyuan001.good@163.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if defined (DOUBLE_SUPPORT)
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#endif
//////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////BITWISE_XOR////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////////
/**************************************bitwise_xor without mask**************************************/
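// Element-wise variant: both sources are read with the same alignment
// correction (dst_align is derived from the destination offset), so all
// three buffers stay in lock-step even when the ROI starts mid-word.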
__kernel void arithm_bitwise_xor_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#undef dst_align
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
uchar4 src1_data = vload4(0, src1 + src1_index);
uchar4 src2_data = vload4(0, src2 + src2_index);
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = src1_data ^ src2_data;
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;
*((__global uchar4 *)(dst + dst_index)) = dst_data;
}
}
__kernel void arithm_bitwise_xor_D1 (__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#undef dst_align
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
char4 src1_data = vload4(0, src1 + src1_index);
char4 src2_data = vload4(0, src2 + src2_index);
char4 dst_data = *((__global char4 *)(dst + dst_index));
char4 tmp_data = src1_data ^ src2_data;
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;
*((__global char4 *)(dst + dst_index)) = dst_data;
}
}
__kernel void arithm_bitwise_xor_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *src2, int src2_step, int src2_offset,
__global ushort *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#undef dst_align
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8);
ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index));
ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));
ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index));
ushort4 tmp_data = src1_data ^ src2_data;
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y;
dst_data.z = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.z : dst_data.z;
dst_data.w = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) ? tmp_data.w : dst_data.w;
*((__global ushort4 *)((__global char *)dst + dst_index)) = dst_data;
}
}
__kernel void arithm_bitwise_xor_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *src2, int src2_step, int src2_offset,
__global short *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#undef dst_align
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8);
short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index));
short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));
short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
short4 tmp_data = src1_data ^ src2_data;
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y;
dst_data.z = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.z : dst_data.z;
dst_data.w = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) ? tmp_data.w : dst_data.w;
*((__global short4 *)((__global char *)dst + dst_index)) = dst_data;
}
}
__kernel void arithm_bitwise_xor_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *src2, int src2_step, int src2_offset,
__global int *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
int data1 = *((__global int *)((__global char *)src1 + src1_index));
int data2 = *((__global int *)((__global char *)src2 + src2_index));
int tmp = data1 ^ data2;
*((__global int *)((__global char *)dst + dst_index)) = tmp;
}
}
__kernel void arithm_bitwise_xor_D5 (__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
char4 data1 = *((__global char4 *)((__global char *)src1 + src1_index));
char4 data2 = *((__global char4 *)((__global char *)src2 + src2_index));
char4 tmp = data1 ^ data2;
*((__global char4 *)((__global char *)dst + dst_index)) = tmp;
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_bitwise_xor_D6 (__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset);
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
char8 data1 = *((__global char8 *)((__global char *)src1 + src1_index));
char8 data2 = *((__global char8 *)((__global char *)src2 + src2_index));
*((__global char8 *)((__global char *)dst + dst_index)) = data1 ^ data2;
}
}
#endif

File diff suppressed because it is too large

View File

@ -0,0 +1,907 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Jiang Liyuan, jlyuan001.good@163.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if defined (DOUBLE_SUPPORT)
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#endif
//////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////BITWISE_XOR////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////////
/**************************************xor with scalar without mask**************************************/
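// These kernels largely mirror the OR-with-scalar file above with '^'
// substituted; note that the 32F variant (C1_D5) here also clamps its
// write against dst_start/dst_end.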
__kernel void arithm_s_bitwise_xor_C1_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
uchar4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#undef dst_align
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
uchar4 src1_data = vload4(0, src1 + src1_index);
uchar4 src2_data = (uchar4)(src2.x, src2.x, src2.x, src2.x);
uchar4 data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = src1_data ^ src2_data;
data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x;
data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : data.y;
data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : data.z;
data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : data.w;
*((__global uchar4 *)(dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_xor_C1_D1 (__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
char4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#undef dst_align
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
char4 src1_data = vload4(0, src1 + src1_index);
char4 src2_data = (char4)(src2.x, src2.x, src2.x, src2.x);
char4 data = *((__global char4 *)(dst + dst_index));
char4 tmp_data = src1_data ^ src2_data;
data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x;
data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : data.y;
data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : data.z;
data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : data.w;
*((__global char4 *)(dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_xor_C1_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *dst, int dst_step, int dst_offset,
ushort4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 1;
#undef dst_align
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
ushort2 src1_data = vload2(0, (__global ushort *)((__global char *)src1 + src1_index));
ushort2 src2_data = (ushort2)(src2.x, src2.x);
ushort2 data = *((__global ushort2 *)((__global uchar *)dst + dst_index));
ushort2 tmp_data = src1_data ^ src2_data;
data.x = (dst_index + 0 >= dst_start) ? tmp_data.x : data.x;
data.y = (dst_index + 2 < dst_end ) ? tmp_data.y : data.y;
*((__global ushort2 *)((__global uchar *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_xor_C1_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
short4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 1;
#undef dst_align
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
short2 src1_data = vload2(0, (__global short *)((__global char *)src1 + src1_index));
short2 src2_data = (short2)(src2.x, src2.x);
short2 data = *((__global short2 *)((__global uchar *)dst + dst_index));
short2 tmp_data = src1_data ^ src2_data;
data.x = (dst_index + 0 >= dst_start) ? tmp_data.x : data.x;
data.y = (dst_index + 2 < dst_end ) ? tmp_data.y : data.y;
*((__global short2 *)((__global uchar *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_xor_C1_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
int src_data1 = *((__global int *)((__global char *)src1 + src1_index));
int src_data2 = src2.x;
int data = src_data1 ^ src_data2;
*((__global int *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_xor_C1_D5 (__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
char16 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
char4 src1_data = *((__global char4 *)((__global char *)src1 + src1_index));
char4 src2_data = (char4)(src2.s0, src2.s1, src2.s2, src2.s3);
char4 data = *((__global char4 *)((__global char *)dst + dst_index));
char4 tmp_data = src1_data ^ src2_data;
data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x;
data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : data.y;
data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : data.z;
data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : data.w;
*((__global char4 *)((__global char *)dst + dst_index)) = data;
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_s_bitwise_xor_C1_D6 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
short16 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
short4 src1_data = *((__global short4 *)((__global char *)src1 + src1_index));
short4 src2_data = (short4)(src2.s0, src2.s1, src2.s2, src2.s3);
short4 tmp_data = src1_data ^ src2_data;
*((__global short4 *)((__global char *)dst + dst_index)) = tmp_data;
}
}
#endif
__kernel void arithm_s_bitwise_xor_C2_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
uchar4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 1;
#undef dst_align
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
uchar4 src1_data = vload4(0, src1 + src1_index);
uchar4 src2_data = (uchar4)(src2.x, src2.y, src2.x, src2.y);
uchar4 data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = src1_data ^ src2_data;
data.xy = (dst_index + 0 >= dst_start) ? tmp_data.xy : data.xy;
data.zw = (dst_index + 2 < dst_end ) ? tmp_data.zw : data.zw;
*((__global uchar4 *)(dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_xor_C2_D1 (__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
char4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 1;
#undef dst_align
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
char4 src1_data = vload4(0, src1 + src1_index);
char4 src2_data = (char4)(src2.x, src2.y, src2.x, src2.y);
char4 data = *((__global char4 *)(dst + dst_index));
char4 tmp_data = src1_data ^ src2_data;
data.xy = (dst_index + 0 >= dst_start) ? tmp_data.xy : data.xy;
data.zw = (dst_index + 2 < dst_end ) ? tmp_data.zw : data.zw;
*((__global char4 *)(dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_xor_C2_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *dst, int dst_step, int dst_offset,
ushort4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
ushort2 src_data1 = *((__global ushort2 *)((__global char *)src1 + src1_index));
ushort2 src_data2 = (ushort2)(src2.x, src2.y);
ushort2 data = src_data1 ^ src_data2;
*((__global ushort2 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_xor_C2_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
short4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
short2 src_data1 = *((__global short2 *)((__global char *)src1 + src1_index));
short2 src_data2 = (short2)(src2.x, src2.y);
short2 data = src_data1 ^ src_data2;
*((__global short2 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_xor_C2_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
int2 src_data1 = *((__global int2 *)((__global char *)src1 + src1_index));
int2 src_data2 = (int2)(src2.x, src2.y);
int2 data = src_data1 ^ src_data2;
*((__global int2 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_xor_C2_D5 (__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
char16 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
char8 src1_data = *((__global char8 *)((__global char *)src1 + src1_index));
char8 src2_data = (char8)(src2.s0, src2.s1, src2.s2, src2.s3, src2.s4, src2.s5, src2.s6, src2.s7);
char8 tmp_data = src1_data ^ src2_data;
*((__global char8 *)((__global char *)dst + dst_index)) = tmp_data;
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_s_bitwise_xor_C2_D6 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
short16 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 4) + dst_offset);
short8 src1_data = *((__global short8 *)((__global char *)src1 + src1_index));
short8 src2_data = (short8)(src2.s0, src2.s1, src2.s2, src2.s3, src2.s4, src2.s5, src2.s6, src2.s7);
short8 tmp_data = src1_data ^ src2_data;
*((__global short8 *)((__global char *)dst + dst_index)) = tmp_data;
}
}
#endif
__kernel void arithm_s_bitwise_xor_C3_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
uchar4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#undef dst_align
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3));
uchar4 src1_data_0 = vload4(0, src1 + src1_index + 0);
uchar4 src1_data_1 = vload4(0, src1 + src1_index + 4);
uchar4 src1_data_2 = vload4(0, src1 + src1_index + 8);
uchar4 src2_data_0 = (uchar4)(src2.x, src2.y, src2.z, src2.x);
uchar4 src2_data_1 = (uchar4)(src2.y, src2.z, src2.x, src2.y);
uchar4 src2_data_2 = (uchar4)(src2.z, src2.x, src2.y, src2.z);
uchar4 data_0 = *((__global uchar4 *)(dst + dst_index + 0));
uchar4 data_1 = *((__global uchar4 *)(dst + dst_index + 4));
uchar4 data_2 = *((__global uchar4 *)(dst + dst_index + 8));
uchar4 tmp_data_0 = src1_data_0 ^ src2_data_0;
uchar4 tmp_data_1 = src1_data_1 ^ src2_data_1;
uchar4 tmp_data_2 = src1_data_2 ^ src2_data_2;
data_0.xyz = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
data_0.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_0.w : data_0.w;
data_1.xy = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_1.xy : data_1.xy;
data_1.zw = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.zw : data_1.zw;
data_2.x = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.x : data_2.x;
data_2.yzw = ((dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
? tmp_data_2.yzw : data_2.yzw;
*((__global uchar4 *)(dst + dst_index + 0)) = data_0;
*((__global uchar4 *)(dst + dst_index + 4)) = data_1;
*((__global uchar4 *)(dst + dst_index + 8)) = data_2;
}
}
__kernel void arithm_s_bitwise_xor_C3_D1 (__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
char4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#undef dst_align
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3));
char4 src1_data_0 = vload4(0, src1 + src1_index + 0);
char4 src1_data_1 = vload4(0, src1 + src1_index + 4);
char4 src1_data_2 = vload4(0, src1 + src1_index + 8);
char4 src2_data_0 = (char4)(src2.x, src2.y, src2.z, src2.x);
char4 src2_data_1 = (char4)(src2.y, src2.z, src2.x, src2.y);
char4 src2_data_2 = (char4)(src2.z, src2.x, src2.y, src2.z);
char4 data_0 = *((__global char4 *)(dst + dst_index + 0));
char4 data_1 = *((__global char4 *)(dst + dst_index + 4));
char4 data_2 = *((__global char4 *)(dst + dst_index + 8));
char4 tmp_data_0 = src1_data_0 ^ src2_data_0;
char4 tmp_data_1 = src1_data_1 ^ src2_data_1;
char4 tmp_data_2 = src1_data_2 ^ src2_data_2;
data_0.xyz = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
data_0.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_0.w : data_0.w;
data_1.xy = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_1.xy : data_1.xy;
data_1.zw = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.zw : data_1.zw;
data_2.x = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.x : data_2.x;
data_2.yzw = ((dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
? tmp_data_2.yzw : data_2.yzw;
*((__global char4 *)(dst + dst_index + 0)) = data_0;
*((__global char4 *)(dst + dst_index + 4)) = data_1;
*((__global char4 *)(dst + dst_index + 8)) = data_2;
}
}
__kernel void arithm_s_bitwise_xor_C3_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *dst, int dst_step, int dst_offset,
ushort4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 1;
#undef dst_align
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6));
ushort2 src1_data_0 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 0));
ushort2 src1_data_1 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 4));
ushort2 src1_data_2 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 8));
ushort2 src2_data_0 = (ushort2)(src2.x, src2.y);
ushort2 src2_data_1 = (ushort2)(src2.z, src2.x);
ushort2 src2_data_2 = (ushort2)(src2.y, src2.z);
ushort2 data_0 = *((__global ushort2 *)((__global char *)dst + dst_index + 0));
ushort2 data_1 = *((__global ushort2 *)((__global char *)dst + dst_index + 4));
ushort2 data_2 = *((__global ushort2 *)((__global char *)dst + dst_index + 8));
ushort2 tmp_data_0 = src1_data_0 ^ src2_data_0;
ushort2 tmp_data_1 = src1_data_1 ^ src2_data_1;
ushort2 tmp_data_2 = src1_data_2 ^ src2_data_2;
data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;
data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;
*((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
*((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
*((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
}
}
__kernel void arithm_s_bitwise_xor_C3_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
short4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 1;
#undef dst_align
#define dst_align (((dst_offset % dst_step) / 6) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6));
short2 src1_data_0 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 0));
short2 src1_data_1 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 4));
short2 src1_data_2 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 8));
short2 src2_data_0 = (short2)(src2.x, src2.y);
short2 src2_data_1 = (short2)(src2.z, src2.x);
short2 src2_data_2 = (short2)(src2.y, src2.z);
short2 data_0 = *((__global short2 *)((__global char *)dst + dst_index + 0));
short2 data_1 = *((__global short2 *)((__global char *)dst + dst_index + 4));
short2 data_2 = *((__global short2 *)((__global char *)dst + dst_index + 8));
short2 tmp_data_0 = src1_data_0 ^ src2_data_0;
short2 tmp_data_1 = src1_data_1 ^ src2_data_1;
short2 tmp_data_2 = src1_data_2 ^ src2_data_2;
data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;
data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;
*((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
*((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
*((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
}
}
__kernel void arithm_s_bitwise_xor_C3_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
int src1_data_0 = *((__global int *)((__global char *)src1 + src1_index + 0));
int src1_data_1 = *((__global int *)((__global char *)src1 + src1_index + 4));
int src1_data_2 = *((__global int *)((__global char *)src1 + src1_index + 8));
int src2_data_0 = src2.x;
int src2_data_1 = src2.y;
int src2_data_2 = src2.z;
int data_0 = *((__global int *)((__global char *)dst + dst_index + 0));
int data_1 = *((__global int *)((__global char *)dst + dst_index + 4));
int data_2 = *((__global int *)((__global char *)dst + dst_index + 8));
int tmp_data_0 = src1_data_0 ^ src2_data_0;
int tmp_data_1 = src1_data_1 ^ src2_data_1;
int tmp_data_2 = src1_data_2 ^ src2_data_2;
*((__global int *)((__global char *)dst + dst_index + 0))= tmp_data_0;
*((__global int *)((__global char *)dst + dst_index + 4))= tmp_data_1;
*((__global int *)((__global char *)dst + dst_index + 8))= tmp_data_2;
}
}
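/* In the D4 (int) kernel above, one pixel already spans 12 bytes and each work
   item writes exactly one pixel, so the partial-vector masking used by the
   narrower-typed variants is unnecessary here. */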
__kernel void arithm_s_bitwise_xor_C3_D5 (__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
char16 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
char4 src1_data_0 = *((__global char4 *)((__global char *)src1 + src1_index + 0));
char4 src1_data_1 = *((__global char4 *)((__global char *)src1 + src1_index + 4));
char4 src1_data_2 = *((__global char4 *)((__global char *)src1 + src1_index + 8));
char4 src2_data_0 = (char4)(src2.s0, src2.s1, src2.s2, src2.s3);
char4 src2_data_1 = (char4)(src2.s4, src2.s5, src2.s6, src2.s7);
char4 src2_data_2 = (char4)(src2.s8, src2.s9, src2.sA, src2.sB);
char4 data_0 = *((__global char4 *)((__global char *)dst + dst_index + 0));
char4 data_1 = *((__global char4 *)((__global char *)dst + dst_index + 4));
char4 data_2 = *((__global char4 *)((__global char *)dst + dst_index + 8));
char4 tmp_data_0 = src1_data_0 ^ src2_data_0;
char4 tmp_data_1 = src1_data_1 ^ src2_data_1;
char4 tmp_data_2 = src1_data_2 ^ src2_data_2;
*((__global char4 *)((__global char *)dst + dst_index + 0))= tmp_data_0;
*((__global char4 *)((__global char *)dst + dst_index + 4))= tmp_data_1;
*((__global char4 *)((__global char *)dst + dst_index + 8))= tmp_data_2;
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_s_bitwise_xor_C3_D6 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
short16 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 24));
short4 src1_data_0 = *((__global short4 *)((__global char *)src1 + src1_index + 0 ));
short4 src1_data_1 = *((__global short4 *)((__global char *)src1 + src1_index + 8 ));
short4 src1_data_2 = *((__global short4 *)((__global char *)src1 + src1_index + 16));
short4 src2_data_0 = (short4)(src2.s0, src2.s1, src2.s2, src2.s3);
short4 src2_data_1 = (short4)(src2.s4, src2.s5, src2.s6, src2.s7);
short4 src2_data_2 = (short4)(src2.s8, src2.s9, src2.sa, src2.sb);
short4 data_0 = *((__global short4 *)((__global char *)dst + dst_index + 0 ));
short4 data_1 = *((__global short4 *)((__global char *)dst + dst_index + 8 ));
short4 data_2 = *((__global short4 *)((__global char *)dst + dst_index + 16));
short4 tmp_data_0 = src1_data_0 ^ src2_data_0;
short4 tmp_data_1 = src1_data_1 ^ src2_data_1;
short4 tmp_data_2 = src1_data_2 ^ src2_data_2;
*((__global short4 *)((__global char *)dst + dst_index + 0 ))= tmp_data_0;
*((__global short4 *)((__global char *)dst + dst_index + 8 ))= tmp_data_1;
*((__global short4 *)((__global char *)dst + dst_index + 16))= tmp_data_2;
}
}
#endif
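/* The D5 (float) and D6 (double) variants receive the scalar as char16/short16
   rather than as floating-point vectors: bitwise XOR is defined only on
   integer types, so these kernels work directly on the raw bit patterns. */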
__kernel void arithm_s_bitwise_xor_C4_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
uchar4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
uchar4 src_data1 = *((__global uchar4 *)(src1 + src1_index));
uchar4 data = src_data1 ^ src2;
*((__global uchar4 *)(dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_xor_C4_D1 (__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
char4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
char4 src_data1 = *((__global char4 *)(src1 + src1_index));
char4 data = src_data1 ^ src2;
*((__global char4 *)(dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_xor_C4_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *dst, int dst_step, int dst_offset,
ushort4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
ushort4 src_data1 = *((__global ushort4 *)((__global char *)src1 + src1_index));
ushort4 data = src_data1 ^ src2;
*((__global ushort4 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_xor_C4_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
short4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
short4 src_data1 = *((__global short4 *)((__global char *)src1 + src1_index));
short4 data = src_data1 ^ src2;
*((__global short4 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_xor_C4_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 4) + dst_offset);
int4 src_data1 = *((__global int4 *)((__global char *)src1 + src1_index));
int4 data = src_data1 ^ src2;
*((__global int4 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_xor_C4_D5 (__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
char16 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 4) + dst_offset);
char16 src1_data = *((__global char16 *)((__global char *)src1 + src1_index));
char16 src2_data = (char16)(src2.s0, src2.s1, src2.s2, src2.s3, src2.s4, src2.s5, src2.s6, src2.s7,
src2.s8, src2.s9, src2.sa, src2.sb, src2.sc, src2.sd, src2.se, src2.sf);
char16 tmp_data = src1_data ^ src2_data;
*((__global char16 *)((__global char *)dst + dst_index)) = tmp_data;
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_s_bitwise_xor_C4_D6 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
short16 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 5) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 5) + dst_offset);
short4 src1_data_0 = *((__global short4 *)((__global char *)src1 + src1_index + 0));
short4 src1_data_1 = *((__global short4 *)((__global char *)src1 + src1_index + 8));
short4 src1_data_2 = *((__global short4 *)((__global char *)src1 + src1_index + 16));
short4 src1_data_3 = *((__global short4 *)((__global char *)src1 + src1_index + 24));
short4 src2_data_0 = (short4)(src2.s0, src2.s1, src2.s2, src2.s3);
short4 src2_data_1 = (short4)(src2.s4, src2.s5, src2.s6, src2.s7);
short4 src2_data_2 = (short4)(src2.s8, src2.s9, src2.sa, src2.sb);
short4 src2_data_3 = (short4)(src2.sc, src2.sd, src2.se, src2.sf);
short4 tmp_data_0 = src1_data_0 ^ src2_data_0;
short4 tmp_data_1 = src1_data_1 ^ src2_data_1;
short4 tmp_data_2 = src1_data_2 ^ src2_data_2;
short4 tmp_data_3 = src1_data_3 ^ src2_data_3;
*((__global short4 *)((__global char *)dst + dst_index + 0 ))= tmp_data_0;
*((__global short4 *)((__global char *)dst + dst_index + 8 ))= tmp_data_1;
*((__global short4 *)((__global char *)dst + dst_index + 16))= tmp_data_2;
*((__global short4 *)((__global char *)dst + dst_index + 24))= tmp_data_3;
}
}
#endif
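For orientation, the following stand-alone sketch shows how a host program could drive one of these scalar kernels with the plain OpenCL C API. It is illustrative only and not part of this commit: the kernel file path and image size are assumptions, and error checking is omitted.

#include <CL/cl.h>
#include <stdio.h>
#include <string.h>

/* Hypothetical launcher for arithm_s_bitwise_xor_C4_D0: dst = src1 ^ scalar,
   one work item per uchar4 pixel; all steps and offsets are in bytes. */
int main(void)
{
    enum { ROWS = 4, COLS = 8, STEP = COLS * 4 };
    cl_uchar src1[ROWS * STEP], dst[ROWS * STEP];
    memset(src1, 0xAB, sizeof(src1));

    cl_platform_id plat;  cl_device_id dev;
    clGetPlatformIDs(1, &plat, NULL);
    clGetDeviceIDs(plat, CL_DEVICE_TYPE_DEFAULT, 1, &dev, NULL);
    cl_context ctx = clCreateContext(NULL, 1, &dev, NULL, NULL, NULL);
    cl_command_queue q = clCreateCommandQueue(ctx, dev, 0, NULL);

    FILE *f = fopen("arithm_bitwise_xor.cl", "r");   /* assumed path to the file above */
    static char source[1 << 20];
    size_t len = fread(source, 1, sizeof(source) - 1, f);
    fclose(f);
    const char *src_ptr = source;
    cl_program prog = clCreateProgramWithSource(ctx, 1, &src_ptr, &len, NULL);
    clBuildProgram(prog, 1, &dev, NULL, NULL, NULL);
    cl_kernel k = clCreateKernel(prog, "arithm_s_bitwise_xor_C4_D0", NULL);

    cl_mem d_src = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                  sizeof(src1), src1, NULL);
    cl_mem d_dst = clCreateBuffer(ctx, CL_MEM_READ_WRITE, sizeof(dst), NULL, NULL);

    cl_int step = STEP, offset = 0, rows = ROWS, cols = COLS, step1 = STEP;
    cl_uchar4 scalar;
    scalar.s[0] = 0xFF; scalar.s[1] = 0x00; scalar.s[2] = 0xFF; scalar.s[3] = 0x00;

    clSetKernelArg(k, 0, sizeof(d_src),  &d_src);
    clSetKernelArg(k, 1, sizeof(step),   &step);
    clSetKernelArg(k, 2, sizeof(offset), &offset);
    clSetKernelArg(k, 3, sizeof(d_dst),  &d_dst);
    clSetKernelArg(k, 4, sizeof(step),   &step);
    clSetKernelArg(k, 5, sizeof(offset), &offset);
    clSetKernelArg(k, 6, sizeof(scalar), &scalar);
    clSetKernelArg(k, 7, sizeof(rows),   &rows);
    clSetKernelArg(k, 8, sizeof(cols),   &cols);
    clSetKernelArg(k, 9, sizeof(step1),  &step1);

    size_t gsize[2] = { COLS, ROWS };   /* x indexes uchar4 pixels, y indexes rows */
    clEnqueueNDRangeKernel(q, k, 2, NULL, gsize, NULL, 0, NULL, NULL);
    clEnqueueReadBuffer(q, d_dst, CL_TRUE, 0, sizeof(dst), dst, 0, NULL, NULL);
    printf("dst[0..3] = %02x %02x %02x %02x\n", dst[0], dst[1], dst[2], dst[3]);
    return 0;
}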

File diff suppressed because it is too large

View File

@ -0,0 +1,132 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Jia Haipeng, jiahaipeng95@gmail.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if defined (DOUBLE_SUPPORT)
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#endif
#define CV_PI 3.1415926535897932384626433832795
__kernel void arithm_cartToPolar_D5 (__global float *src1, int src1_step, int src1_offset,
__global float *src2, int src2_step, int src2_offset,
__global float *dst1, int dst1_step, int dst1_offset, //magnitude
__global float *dst2, int dst2_step, int dst2_offset, //angle
int rows, int cols, int angInDegree)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset);
int dst1_index = mad24(y, dst1_step, (x << 2) + dst1_offset);
int dst2_index = mad24(y, dst2_step, (x << 2) + dst2_offset);
float x = *((__global float *)((__global char *)src1 + src1_index));
float y = *((__global float *)((__global char *)src2 + src2_index));
float x2 = x * x;
float y2 = y * y;
float magnitude = sqrt(x2 + y2);
float cartToPolar;
float tmp = y >= 0 ? 0 : CV_PI*2;
tmp = x < 0 ? CV_PI : tmp;
float tmp1 = y >= 0 ? CV_PI*0.5 : CV_PI*1.5;
cartToPolar = y2 <= x2 ? x*y/(x2 + 0.28f*y2 + (float)DBL_EPSILON) + tmp :
tmp1 - x*y/(y2 + 0.28f*x2 + (float)DBL_EPSILON);
cartToPolar = angInDegree == 0 ? cartToPolar : cartToPolar * (float)(180/CV_PI);
*((__global float *)((__global char *)dst1 + dst1_index)) = magnitude;
*((__global float *)((__global char *)dst2 + dst2_index)) = cartToPolar;
}
}
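/* The angle above comes from the rational approximation
   atan(t) ~= t / (1 + 0.28*t^2), applied to t = y/x when y^2 <= x^2 and to the
   mirrored form otherwise, followed by the tmp/tmp1 quadrant fix-up. The
   worst-case error of this approximation is roughly 0.005 radians. */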
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_cartToPolar_D6 (__global double *src1, int src1_step, int src1_offset,
__global double *src2, int src2_step, int src2_offset,
__global double *dst1, int dst1_step, int dst1_offset,
__global double *dst2, int dst2_step, int dst2_offset,
int rows, int cols, int angInDegree)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset);
int dst1_index = mad24(y, dst1_step, (x << 3) + dst1_offset);
int dst2_index = mad24(y, dst2_step, (x << 3) + dst2_offset);
double x = *((__global double *)((__global char *)src1 + src1_index));
double y = *((__global double *)((__global char *)src2 + src2_index));
double x2 = x * x;
double y2 = y * y;
double magnitude = sqrt(x2 + y2);
double cartToPolar;
double tmp = y >= 0 ? 0 : CV_PI*2;
tmp = x < 0 ? CV_PI : tmp;
double tmp1 = y >= 0 ? CV_PI*0.5 : CV_PI*1.5;
cartToPolar = y2 <= x2 ? x*y/(x2 + 0.28*y2 + DBL_EPSILON) + tmp :
tmp1 - x*y/(y2 + 0.28*x2 + DBL_EPSILON);
cartToPolar = angInDegree == 0 ? cartToPolar : cartToPolar * (180/CV_PI);
*((__global double *)((__global char *)dst1 + dst1_index)) = magnitude;
*((__global double *)((__global char *)dst2 + dst2_index)) = cartToPolar;
}
}
#endif
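As a quick sanity check of that 0.28 rational approximation, a small stand-alone C program (illustrative, not part of this commit) can measure its worst-case deviation from atan over the first octant:

#include <math.h>
#include <stdio.h>

/* Measures the worst-case error of atan(t) ~= t/(1 + 0.28*t*t) for t in [0,1],
   the range the kernels reduce to before the quadrant fix-up. */
int main(void)
{
    double worst = 0.0;
    for (double t = 0.0; t <= 1.0; t += 1e-5)
    {
        double err = fabs(t / (1.0 + 0.28 * t * t) - atan(t));
        if (err > worst) worst = err;
    }
    printf("max |error| = %.6f rad (%.4f deg)\n", worst, worst * 180.0 / acos(-1.0));
    return 0;
}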

View File

@ -0,0 +1,691 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Jiang Liyuan, jlyuan001.good@163.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if defined (DOUBLE_SUPPORT)
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#endif
//////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////Compare EQ////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////////
__kernel void arithm_compare_eq_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#undef dst_align
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, (dst_offset + x) & (int)0xfffffffc);
uchar4 src1_data = vload4(0, src1 + src1_index);
uchar4 src2_data = vload4(0, src2 + src2_index);
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data == src2_data));
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;
*((__global uchar4 *)(dst + dst_index)) = dst_data;
}
}
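/* In OpenCL C, a vector comparison such as (src1_data == src2_data) yields -1
   (all bits set) in each true lane and 0 in each false lane; convert_uchar4
   then wraps -1 to 0xFF, giving the 255/0 mask convention that cv::compare
   produces on the CPU. */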
__kernel void arithm_compare_eq_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#undef dst_align
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, (dst_offset + x) & (int)0xfffffffc);
ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index));
ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data == src2_data));
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;
*((__global uchar4 *)(dst + dst_index)) = dst_data;
}
}
__kernel void arithm_compare_eq_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#undef dst_align
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, (dst_offset + x) & (int)0xfffffffc);
short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index));
short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data == src2_data));
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;
*((__global uchar4 *)(dst + dst_index)) = dst_data;
}
}
__kernel void arithm_compare_eq_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#undef dst_align
#define dst_align ((dst_offset >> 2) & 3)
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, (dst_offset + x) & (int)0xfffffffc);
int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index));
int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index));
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data == src2_data));
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;
*((__global uchar4 *)(dst + dst_index)) = dst_data;
}
}
__kernel void arithm_compare_eq_D5 (__global float *src1, int src1_step, int src1_offset,
__global float *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#undef dst_align
#define dst_align ((dst_offset >> 2) & 3)
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, (dst_offset + x) & (int)0xfffffffc);
float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index));
float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index));
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data == src2_data));
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;
*((__global uchar4 *)(dst + dst_index)) = dst_data;
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_compare_eq_D6 (__global double *src1, int src1_step, int src1_offset,
__global double *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#undef dst_align
#define dst_align ((dst_offset >> 3) & 3)
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, (dst_offset + x) & (int)0xfffffffc);
double4 src1_data = vload4(0, (__global double *)((__global char *)src1 + src1_index));
double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index));
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data == src2_data));
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;
*((__global uchar4 *)(dst + dst_index)) = dst_data;
}
}
#endif
/***********************************Compare GT**************************/
__kernel void arithm_compare_gt_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#undef dst_align
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, (dst_offset + x) & (int)0xfffffffc);
uchar4 src1_data = vload4(0, src1 + src1_index);
uchar4 src2_data = vload4(0, src2 + src2_index);
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data > src2_data));
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;
*((__global uchar4 *)(dst + dst_index)) = dst_data;
}
}
__kernel void arithm_compare_gt_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#undef dst_align
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, (dst_offset + x) & (int)0xfffffffc);
ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index));
ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data > src2_data));
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;
*((__global uchar4 *)(dst + dst_index)) = dst_data;
}
}
__kernel void arithm_compare_gt_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#undef dst_align
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, (dst_offset + x) & (int)0xfffffffc);
short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index));
short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data > src2_data));
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;
*((__global uchar4 *)(dst + dst_index)) = dst_data;
}
}
__kernel void arithm_compare_gt_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#undef dst_align
#define dst_align ((dst_offset >> 2) & 3)
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, (dst_offset + x) & (int)0xfffffffc);
int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index));
int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index));
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data > src2_data));
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;
*((__global uchar4 *)(dst + dst_index)) = dst_data;
}
}
__kernel void arithm_compare_gt_D5 (__global float *src1, int src1_step, int src1_offset,
__global float *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#undef dst_align
#define dst_align ((dst_offset >> 2) & 3)
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, (dst_offset + x) & (int)0xfffffffc);
float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index));
float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index));
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data > src2_data));
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;
*((__global uchar4 *)(dst + dst_index)) = dst_data;
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_compare_gt_D6 (__global double *src1, int src1_step, int src1_offset,
__global double *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#undef dst_align
#define dst_align ((dst_offset >> 3) & 3)
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, (dst_offset + x) & (int)0xfffffffc);
double4 src1_data = vload4(0, (__global double *)((__global char *)src1 + src1_index));
double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index));
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data > src2_data));
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;
*((__global uchar4 *)(dst + dst_index)) = dst_data;
}
}
#endif
/***********************************Compare GE**************************/
__kernel void arithm_compare_ge_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#undef dst_align
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, (dst_offset + x) & (int)0xfffffffc);
uchar4 src1_data = vload4(0, src1 + src1_index);
uchar4 src2_data = vload4(0, src2 + src2_index);
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data >= src2_data));
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;
*((__global uchar4 *)(dst + dst_index)) = dst_data;
}
}
__kernel void arithm_compare_ge_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#undef dst_align
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, (dst_offset + x) & (int)0xfffffffc);
ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index));
ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data >= src2_data));
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;
*((__global uchar4 *)(dst + dst_index)) = dst_data;
}
}
__kernel void arithm_compare_ge_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#undef dst_align
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, (dst_offset + x) & (int)0xfffffffc);
short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index));
short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data >= src2_data));
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;
*((__global uchar4 *)(dst + dst_index)) = dst_data;
}
}
__kernel void arithm_compare_ge_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#undef dst_align
#define dst_align ((dst_offset >> 2) & 3)
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, (dst_offset + x) & (int)0xfffffffc);
int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index));
int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index));
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data >= src2_data));
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;
*((__global uchar4 *)(dst + dst_index)) = dst_data;
}
}
__kernel void arithm_compare_ge_D5 (__global float *src1, int src1_step, int src1_offset,
__global float *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#undef dst_align
#define dst_align ((dst_offset >> 2) & 3)
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, (dst_offset + x) & (int)0xfffffffc);
float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index));
float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index));
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data >= src2_data));
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;
*((__global uchar4 *)(dst + dst_index)) = dst_data;
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_compare_ge_D6 (__global double *src1, int src1_step, int src1_offset,
__global double *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#undef dst_align
#define dst_align ((dst_offset >> 3) & 3)
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, (dst_offset + x) & (int)0xfffffffc);
double4 src1_data = vload4(0, (__global double *)((__global char *)src1 + src1_index));
double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index));
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data >= src2_data));
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;
*((__global uchar4 *)(dst + dst_index)) = dst_data;
}
}
#endif

View File

@ -0,0 +1,688 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Jiang Liyuan, jlyuan001.good@163.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if defined (DOUBLE_SUPPORT)
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#endif
/***********************************Compare NE*******************************/
__kernel void arithm_compare_ne_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#undef dst_align
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, (dst_offset + x) & (int)0xfffffffc);
uchar4 src1_data = vload4(0, src1 + src1_index);
uchar4 src2_data = vload4(0, src2 + src2_index);
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data != src2_data));
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;
*((__global uchar4 *)(dst + dst_index)) = dst_data;
}
}
__kernel void arithm_compare_ne_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#undef dst_align
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, (dst_offset + x) & (int)0xfffffffc);
ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index));
ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data != src2_data));
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;
*((__global uchar4 *)(dst + dst_index)) = dst_data;
}
}
__kernel void arithm_compare_ne_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#undef dst_align
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, (dst_offset + x) & (int)0xfffffffc);
short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index));
short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data != src2_data));
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;
*((__global uchar4 *)(dst + dst_index)) = dst_data;
}
}
__kernel void arithm_compare_ne_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#undef dst_align
#define dst_align ((dst_offset >> 2) & 3)
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, (dst_offset + x) & (int)0xfffffffc);
int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index));
int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index));
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data != src2_data));
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;
*((__global uchar4 *)(dst + dst_index)) = dst_data;
}
}
__kernel void arithm_compare_ne_D5 (__global float *src1, int src1_step, int src1_offset,
__global float *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#undef dst_align
#define dst_align ((dst_offset >> 2) & 3)
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, (dst_offset + x) & (int)0xfffffffc);
float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index));
float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index));
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data != src2_data));
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;
*((__global uchar4 *)(dst + dst_index)) = dst_data;
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_compare_ne_D6 (__global double *src1, int src1_step, int src1_offset,
__global double *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#undef dst_align
#define dst_align ((dst_offset >> 3) & 3)
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, (dst_offset + x) & (int)0xfffffffc);
double4 src1_data = vload4(0, (__global double *)((__global char *)src1 + src1_index));
double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index));
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data != src2_data));
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;
*((__global uchar4 *)(dst + dst_index)) = dst_data;
}
}
#endif
/***********************************Compare LT*******************************/
__kernel void arithm_compare_lt_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#undef dst_align
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, (dst_offset + x) & (int)0xfffffffc);
uchar4 src1_data = vload4(0, src1 + src1_index);
uchar4 src2_data = vload4(0, src2 + src2_index);
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data < src2_data));
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;
*((__global uchar4 *)(dst + dst_index)) = dst_data;
}
}
__kernel void arithm_compare_lt_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index));
ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data < src2_data));
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;
*((__global uchar4 *)(dst + dst_index)) = dst_data;
}
}
__kernel void arithm_compare_lt_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index));
short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data < src2_data));
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;
*((__global uchar4 *)(dst + dst_index)) = dst_data;
}
}
__kernel void arithm_compare_lt_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#define dst_align ((dst_offset >> 2) & 3)
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index));
int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index));
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data < src2_data));
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;
*((__global uchar4 *)(dst + dst_index)) = dst_data;
}
}
__kernel void arithm_compare_lt_D5 (__global float *src1, int src1_step, int src1_offset,
__global float *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#define dst_align ((dst_offset >> 2) & 3)
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index));
float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index));
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data < src2_data));
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;
*((__global uchar4 *)(dst + dst_index)) = dst_data;
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_compare_lt_D6 (__global double *src1, int src1_step, int src1_offset,
__global double *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#define dst_align ((dst_offset >> 3) & 3)
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
double4 src1_data = vload4(0, (__global double *)((__global char *)src1 + src1_index));
double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index));
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data < src2_data));
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;
*((__global uchar4 *)(dst + dst_index)) = dst_data;
}
}
#endif
/***********************************Compare LE*******************************/
__kernel void arithm_compare_le_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
uchar4 src1_data = vload4(0, src1 + src1_index);
uchar4 src2_data = vload4(0, src2 + src2_index);
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data <= src2_data));
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;
*((__global uchar4 *)(dst + dst_index)) = dst_data;
}
}
__kernel void arithm_compare_le_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index));
ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data <= src2_data));
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;
*((__global uchar4 *)(dst + dst_index)) = dst_data;
}
}
__kernel void arithm_compare_le_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index));
short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data <= src2_data));
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;
*((__global uchar4 *)(dst + dst_index)) = dst_data;
}
}
__kernel void arithm_compare_le_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#define dst_align ((dst_offset >> 2) & 3)
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index));
int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index));
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data <= src2_data));
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;
*((__global uchar4 *)(dst + dst_index)) = dst_data;
}
}
__kernel void arithm_compare_le_D5 (__global float *src1, int src1_step, int src1_offset,
__global float *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#define dst_align ((dst_offset >> 2) & 3)
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index));
float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index));
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data <= src2_data));
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;
*((__global uchar4 *)(dst + dst_index)) = dst_data;
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_compare_le_D6 (__global double *src1, int src1_step, int src1_offset,
__global double *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#define dst_align ((dst_offset >> 3) & 3)
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
double4 src1_data = vload4(0, (__global double *)((__global char *)src1 + src1_index));
double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index));
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data <= src2_data));
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;
*((__global uchar4 *)(dst + dst_index)) = dst_data;
}
}
#endif
View File
@@ -0,0 +1,446 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Jia Haipeng, jiahaipeng95@gmail.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if defined (DOUBLE_SUPPORT)
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#endif
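/* The round2_* helpers implement round-half-to-even ("banker's rounding"),
   matching cvRound on the host: v1 truncates (OpenCL integer conversions round
   toward zero by default), v2 rounds half away from zero, and an exact tie
   (v - v1 == 0.5) with even v1 keeps v1. The built-in _rte conversions, e.g.
   convert_uchar_sat_rte(v), express the same thing; the explicit form here
   presumably works around drivers of the time. */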
uchar round2_uchar(double v)
{
uchar v1 = convert_uchar_sat(v);
uchar v2 = convert_uchar_sat(v + (v >= 0 ? 0.5 : -0.5));
return (((v - v1) == 0.5) && (v1 % 2 == 0)) ? v1 : v2;
}
ushort round2_ushort(double v)
{
ushort v1 = convert_ushort_sat(v);
ushort v2 = convert_ushort_sat(v + (v >= 0 ? 0.5 : -0.5));
return (((v - v1) == 0.5) && (v1 % 2 == 0)) ? v1 : v2;
}
short round2_short(double v)
{
short v1 = convert_short_sat(v);
short v2 = convert_short_sat(v + (v >= 0 ? 0.5 : -0.5));
return (((v - v1) == 0.5) && (v1 % 2 == 0)) ? v1 : v2;
}
int round2_int(double v)
{
int v1 = convert_int_sat(v);
int v2 = convert_int_sat(v + (v >= 0 ? 0.5 : -0.5));
return (((v - v1) == 0.5) && (v1 % 2 == 0)) ? v1 : v2;
}
///////////////////////////////////////////////////////////////////////////////////////
////////////////////////////divide///////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////
/**********************************div*********************************************/
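/* Per-element division with a scale factor: dst = round2(src1 * scalar / src2),
   computed in double precision, with a zero numerator or zero divisor mapped to
   0 instead of faulting. The uchar/ushort/short variants reuse the aligned
   4-element-per-work-item layout described above; the int, float and double
   variants process one element per work-item. */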
__kernel void arithm_div_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1, double scalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
uchar4 src1_data = vload4(0, src1 + src1_index);
uchar4 src2_data = vload4(0, src2 + src2_index);
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
double4 tmp = convert_double4(src1_data) * scalar;
uchar4 tmp_data;
tmp_data.x = ((tmp.x == 0) || (src2_data.x == 0)) ? 0 : round2_uchar(tmp.x / (double)src2_data.x);
tmp_data.y = ((tmp.y == 0) || (src2_data.y == 0)) ? 0 : round2_uchar(tmp.y / (double)src2_data.y);
tmp_data.z = ((tmp.z == 0) || (src2_data.z == 0)) ? 0 : round2_uchar(tmp.z / (double)src2_data.z);
tmp_data.w = ((tmp.w == 0) || (src2_data.w == 0)) ? 0 : round2_uchar(tmp.w / (double)src2_data.w);
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;
*((__global uchar4 *)(dst + dst_index)) = dst_data;
}
}
__kernel void arithm_div_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *src2, int src2_step, int src2_offset,
__global ushort *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1, double scalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8);
ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index));
ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));
ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index));
double4 tmp = convert_double4(src1_data) * scalar;
ushort4 tmp_data;
tmp_data.x = ((tmp.x == 0) || (src2_data.x == 0)) ? 0 : round2_ushort(tmp.x / (double)src2_data.x);
tmp_data.y = ((tmp.y == 0) || (src2_data.y == 0)) ? 0 : round2_ushort(tmp.y / (double)src2_data.y);
tmp_data.z = ((tmp.z == 0) || (src2_data.z == 0)) ? 0 : round2_ushort(tmp.z / (double)src2_data.z);
tmp_data.w = ((tmp.w == 0) || (src2_data.w == 0)) ? 0 : round2_ushort(tmp.w / (double)src2_data.w);
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y;
dst_data.z = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.z : dst_data.z;
dst_data.w = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) ? tmp_data.w : dst_data.w;
*((__global ushort4 *)((__global char *)dst + dst_index)) = dst_data;
}
}
__kernel void arithm_div_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *src2, int src2_step, int src2_offset,
__global short *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1, double scalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8);
short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index));
short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));
short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
double4 tmp = convert_double4(src1_data) * scalar;
short4 tmp_data;
tmp_data.x = ((tmp.x == 0) || (src2_data.x == 0)) ? 0 : round2_short(tmp.x / (double)src2_data.x);
tmp_data.y = ((tmp.y == 0) || (src2_data.y == 0)) ? 0 : round2_short(tmp.y / (double)src2_data.y);
tmp_data.z = ((tmp.z == 0) || (src2_data.z == 0)) ? 0 : round2_short(tmp.z / (double)src2_data.z);
tmp_data.w = ((tmp.w == 0) || (src2_data.w == 0)) ? 0 : round2_short(tmp.w / (double)src2_data.w);
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y;
dst_data.z = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.z : dst_data.z;
dst_data.w = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) ? tmp_data.w : dst_data.w;
*((__global short4 *)((__global char *)dst + dst_index)) = dst_data;
}
}
__kernel void arithm_div_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *src2, int src2_step, int src2_offset,
__global int *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1, double scalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
int data1 = *((__global int *)((__global char *)src1 + src1_index));
int data2 = *((__global int *)((__global char *)src2 + src2_index));
double tmp = convert_double(data1) * scalar;
int tmp_data = (tmp == 0 || data2 == 0) ? 0 : round2_int(tmp / convert_double(data2));
*((__global int *)((__global char *)dst + dst_index)) = tmp_data;
}
}
__kernel void arithm_div_D5 (__global float *src1, int src1_step, int src1_offset,
__global float *src2, int src2_step, int src2_offset,
__global float *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1, double scalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
float data1 = *((__global float *)((__global char *)src1 + src1_index));
float data2 = *((__global float *)((__global char *)src2 + src2_index));
double tmp = convert_double(data1) * scalar;
float tmp_data = (tmp == 0 || data2 == 0) ? 0 : convert_float(tmp / convert_double(data2));
*((__global float *)((__global char *)dst + dst_index)) = tmp_data;
}
}
__kernel void arithm_div_D6 (__global double *src1, int src1_step, int src1_offset,
__global double *src2, int src2_step, int src2_offset,
__global double *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1, double scalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset);
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
double data1 = *((__global double *)((__global char *)src1 + src1_index));
double data2 = *((__global double *)((__global char *)src2 + src2_index));
double tmp = data1 * scalar;
double tmp_data = (tmp == 0 || data2 == 0) ? 0 : (tmp / data2);
*((__global double *)((__global char *)dst + dst_index)) = tmp_data;
}
}
/************************************div with scalar************************************/
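/* Scalar-by-matrix division, i.e. the cv::divide(scale, src, dst) case:
   dst = round2(scalar / src) per element, again yielding 0 whenever the scalar
   or the source element is 0. */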
__kernel void arithm_s_div_D0 (__global uchar *src, int src_step, int src_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1, double scalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#define dst_align (dst_offset & 3)
int src_index = mad24(y, src_step, x + src_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
uchar4 src_data = vload4(0, src + src_index);
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data;
tmp_data.x = ((scalar == 0) || (src_data.x == 0)) ? 0 : round2_uchar(scalar / (double)src_data.x);
tmp_data.y = ((scalar == 0) || (src_data.y == 0)) ? 0 : round2_uchar(scalar / (double)src_data.y);
tmp_data.z = ((scalar == 0) || (src_data.z == 0)) ? 0 : round2_uchar(scalar / (double)src_data.z);
tmp_data.w = ((scalar == 0) || (src_data.w == 0)) ? 0 : round2_uchar(scalar / (double)src_data.w);
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;
*((__global uchar4 *)(dst + dst_index)) = dst_data;
}
}
__kernel void arithm_s_div_D2 (__global ushort *src, int src_step, int src_offset,
__global ushort *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1, double scalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
int src_index = mad24(y, src_step, (x << 1) + src_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8);
ushort4 src_data = vload4(0, (__global ushort *)((__global char *)src + src_index));
ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index));
ushort4 tmp_data;
tmp_data.x = ((scalar == 0) || (src_data.x == 0)) ? 0 : round2_ushort(scalar / (double)src_data.x);
tmp_data.y = ((scalar == 0) || (src_data.y == 0)) ? 0 : round2_ushort(scalar / (double)src_data.y);
tmp_data.z = ((scalar == 0) || (src_data.z == 0)) ? 0 : round2_ushort(scalar / (double)src_data.z);
tmp_data.w = ((scalar == 0) || (src_data.w == 0)) ? 0 : round2_ushort(scalar / (double)src_data.w);
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y;
dst_data.z = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.z : dst_data.z;
dst_data.w = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) ? tmp_data.w : dst_data.w;
*((__global ushort4 *)((__global char *)dst + dst_index)) = dst_data;
}
}
__kernel void arithm_s_div_D3 (__global short *src, int src_step, int src_offset,
__global short *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1, double scalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
int src_index = mad24(y, src_step, (x << 1) + src_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8);
short4 src_data = vload4(0, (__global short *)((__global char *)src + src_index));
short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
short4 tmp_data;
tmp_data.x = ((scalar == 0) || (src_data.x == 0)) ? 0 : round2_short(scalar / (double)src_data.x);
tmp_data.y = ((scalar == 0) || (src_data.y == 0)) ? 0 : round2_short(scalar / (double)src_data.y);
tmp_data.z = ((scalar == 0) || (src_data.z == 0)) ? 0 : round2_short(scalar / (double)src_data.z);
tmp_data.w = ((scalar == 0) || (src_data.w == 0)) ? 0 : round2_short(scalar / (double)src_data.w);
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y;
dst_data.z = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.z : dst_data.z;
dst_data.w = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) ? tmp_data.w : dst_data.w;
*((__global short4 *)((__global char *)dst + dst_index)) = dst_data;
}
}
__kernel void arithm_s_div_D4 (__global int *src, int src_step, int src_offset,
__global int *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1, double scalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src_index = mad24(y, src_step, (x << 2) + src_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
int data = *((__global int *)((__global char *)src + src_index));
int tmp_data = (scalar == 0 || data == 0) ? 0 : round2_int(scalar / convert_double(data));
*((__global int *)((__global char *)dst + dst_index)) = tmp_data;
}
}
__kernel void arithm_s_div_D5 (__global float *src, int src_step, int src_offset,
__global float *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1, double scalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src_index = mad24(y, src_step, (x << 2) + src_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
float data = *((__global float *)((__global char *)src + src_index));
float tmp_data = (scalar == 0 || data == 0) ? 0 : convert_float(scalar / convert_double(data));
*((__global float *)((__global char *)dst + dst_index)) = tmp_data;
}
}
__kernel void arithm_s_div_D6 (__global double *src, int src_step, int src_offset,
__global double *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1, double scalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src_index = mad24(y, src_step, (x << 3) + src_offset);
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
double data = *((__global double *)((__global char *)src + src_index));
double tmp_data = (scalar == 0 || data == 0) ? 0 : (scalar / data);
*((__global double *)((__global char *)dst + dst_index)) = tmp_data;
}
}
View File
@@ -0,0 +1,89 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Wu Zailong, bullet@yeah.net
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if defined (DOUBLE_SUPPORT)
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#endif
//////////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////EXP//////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////////
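/* Element-wise natural exponential. Unlike the kernels above, these are not
   vectorized: each work-item turns its column index into a byte offset
   (<< 2 for float, << 3 for double) and writes a single exp() result. */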
__kernel void arithm_exp_D5(int rows, int cols, int srcStep, int dstStep, int srcOffset, int dstOffset, __global float *src, __global float *dst)
{
int x = get_global_id(0);
int y = get_global_id(1);
if(x < cols && y < rows)
{
x = x << 2;
int srcIdx = mad24( y, srcStep, x + srcOffset);
int dstIdx = mad24( y, dstStep, x + dstOffset);
float src_data = *((__global float *)((__global char *)src + srcIdx));
float dst_data = exp(src_data);
*((__global float *)((__global char *)dst + dstIdx)) = dst_data;
}
}
__kernel void arithm_exp_D6(int rows, int cols, int srcStep, int dstStep, int srcOffset, int dstOffset, __global double *src, __global double *dst)
{
int x = get_global_id(0);
int y = get_global_id(1);
if(x < cols && y < rows )
{
x = x << 3;
int srcIdx = mad24( y, srcStep, x + srcOffset);
int dstIdx = mad24( y, dstStep, x + dstOffset);
double src_data = *((__global double *)((__global char *)src + srcIdx));
double dst_data = exp(src_data);
*((__global double *)((__global char *)dst + dstIdx )) = dst_data;
}
}
View File
@@ -0,0 +1,992 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Jia Haipeng, jiahaipeng95@gmail.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if defined (DOUBLE_SUPPORT)
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#endif
//////////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////flip rows///////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////////
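/* Vertical flip. The launch grid is assumed to cover only the top half of the
   image (thread_rows being about rows / 2): each work-item swaps its block in
   row y with the mirrored block in row rows - 1 - y, using the same alignment
   guards as the arithmetic kernels. */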
__kernel void arithm_flip_rows_D0 (__global uchar *src, int src_step, int src_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int thread_rows, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < thread_rows)
{
x = x << 2;
#define dst_align (dst_offset & 3)
int src_index_0 = mad24(y, src_step, x + src_offset - dst_align);
int src_index_1 = mad24(rows - y - 1, src_step, x + src_offset - dst_align);
int dst_start_0 = mad24(y, dst_step, dst_offset);
int dst_start_1 = mad24(rows - y - 1, dst_step, dst_offset);
int dst_end_0 = mad24(y, dst_step, dst_offset + dst_step1);
int dst_end_1 = mad24(rows - y - 1, dst_step, dst_offset + dst_step1);
int dst_index_0 = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
int dst_index_1 = mad24(rows - y - 1, dst_step, dst_offset + x & (int)0xfffffffc);
uchar4 src_data_0 = vload4(0, src + src_index_0);
uchar4 src_data_1 = vload4(0, src + src_index_1);
uchar4 dst_data_0 = *((__global uchar4 *)(dst + dst_index_0));
uchar4 dst_data_1 = *((__global uchar4 *)(dst + dst_index_1));
dst_data_0.x = (dst_index_0 + 0 >= dst_start_0) ? src_data_1.x : dst_data_0.x;
dst_data_0.y = ((dst_index_0 + 1 >= dst_start_0) && (dst_index_0 + 1 < dst_end_0)) ? src_data_1.y : dst_data_0.y;
dst_data_0.z = ((dst_index_0 + 2 >= dst_start_0) && (dst_index_0 + 2 < dst_end_0)) ? src_data_1.z : dst_data_0.z;
dst_data_0.w = (dst_index_0 + 3 < dst_end_0) ? src_data_1.w : dst_data_0.w;
dst_data_1.x = (dst_index_1 + 0 >= dst_start_1) ? src_data_0.x : dst_data_1.x;
dst_data_1.y = ((dst_index_1 + 1 >= dst_start_1) && (dst_index_1 + 1 < dst_end_1)) ? src_data_0.y : dst_data_1.y;
dst_data_1.z = ((dst_index_1 + 2 >= dst_start_1) && (dst_index_1 + 2 < dst_end_1)) ? src_data_0.z : dst_data_1.z;
dst_data_1.w = (dst_index_1 + 3 < dst_end_1) ? src_data_0.w : dst_data_1.w;
*((__global uchar4 *)(dst + dst_index_0)) = dst_data_0;
*((__global uchar4 *)(dst + dst_index_1)) = dst_data_1;
}
}
__kernel void arithm_flip_rows_D1 (__global char *src, int src_step, int src_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int thread_rows, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < thread_rows)
{
x = x << 2;
#define dst_align (dst_offset & 3)
int src_index_0 = mad24(y, src_step, x + src_offset - dst_align);
int src_index_1 = mad24(rows - y - 1, src_step, x + src_offset - dst_align);
int dst_start_0 = mad24(y, dst_step, dst_offset);
int dst_start_1 = mad24(rows - y - 1, dst_step, dst_offset);
int dst_end_0 = mad24(y, dst_step, dst_offset + dst_step1);
int dst_end_1 = mad24(rows - y - 1, dst_step, dst_offset + dst_step1);
int dst_index_0 = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
int dst_index_1 = mad24(rows - y - 1, dst_step, dst_offset + x & (int)0xfffffffc);
char4 src_data_0 = vload4(0, src + src_index_0);
char4 src_data_1 = vload4(0, src + src_index_1);
char4 dst_data_0 = *((__global char4 *)(dst + dst_index_0));
char4 dst_data_1 = *((__global char4 *)(dst + dst_index_1));
dst_data_0.x = (dst_index_0 + 0 >= dst_start_0) ? src_data_1.x : dst_data_0.x;
dst_data_0.y = ((dst_index_0 + 1 >= dst_start_0) && (dst_index_0 + 1 < dst_end_0)) ? src_data_1.y : dst_data_0.y;
dst_data_0.z = ((dst_index_0 + 2 >= dst_start_0) && (dst_index_0 + 2 < dst_end_0)) ? src_data_1.z : dst_data_0.z;
dst_data_0.w = (dst_index_0 + 3 < dst_end_0) ? src_data_1.w : dst_data_0.w;
dst_data_1.x = (dst_index_1 + 0 >= dst_start_1) ? src_data_0.x : dst_data_1.x;
dst_data_1.y = ((dst_index_1 + 1 >= dst_start_1) && (dst_index_1 + 1 < dst_end_1)) ? src_data_0.y : dst_data_1.y;
dst_data_1.z = ((dst_index_1 + 2 >= dst_start_1) && (dst_index_1 + 2 < dst_end_1)) ? src_data_0.z : dst_data_1.z;
dst_data_1.w = (dst_index_1 + 3 < dst_end_1) ? src_data_0.w : dst_data_1.w;
*((__global char4 *)(dst + dst_index_0)) = dst_data_0;
*((__global char4 *)(dst + dst_index_1)) = dst_data_1;
}
}
__kernel void arithm_flip_rows_D2 (__global ushort *src, int src_step, int src_offset,
__global ushort *dst, int dst_step, int dst_offset,
int rows, int cols, int thread_rows, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < thread_rows)
{
x = x << 2;
#define dst_align (((dst_offset >> 1) & 3) << 1)
int src_index_0 = mad24(y, src_step, (x << 1) + src_offset - dst_align);
int src_index_1 = mad24(rows - y - 1, src_step, (x << 1) + src_offset - dst_align);
int dst_start_0 = mad24(y, dst_step, dst_offset);
int dst_start_1 = mad24(rows - y - 1, dst_step, dst_offset);
int dst_end_0 = mad24(y, dst_step, dst_offset + dst_step1);
int dst_end_1 = mad24(rows - y - 1, dst_step, dst_offset + dst_step1);
int dst_index_0 = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8);
int dst_index_1 = mad24(rows - y - 1, dst_step, dst_offset + (x << 1) & (int)0xfffffff8);
ushort4 src_data_0 = vload4(0, (__global ushort *)((__global char *)src + src_index_0));
ushort4 src_data_1 = vload4(0, (__global ushort *)((__global char *)src + src_index_1));
ushort4 dst_data_0 = *((__global ushort4 *)((__global char *)dst + dst_index_0));
ushort4 dst_data_1 = *((__global ushort4 *)((__global char *)dst + dst_index_1));
dst_data_0.x = (dst_index_0 + 0 >= dst_start_0) ? src_data_1.x : dst_data_0.x;
dst_data_0.y = ((dst_index_0 + 2 >= dst_start_0) && (dst_index_0 + 2 < dst_end_0)) ? src_data_1.y : dst_data_0.y;
dst_data_0.z = ((dst_index_0 + 4 >= dst_start_0) && (dst_index_0 + 4 < dst_end_0)) ? src_data_1.z : dst_data_0.z;
dst_data_0.w = (dst_index_0 + 6 < dst_end_0) ? src_data_1.w : dst_data_0.w;
dst_data_1.x = (dst_index_1 + 0 >= dst_start_1) ? src_data_0.x : dst_data_1.x;
dst_data_1.y = ((dst_index_1 + 2 >= dst_start_1) && (dst_index_1 + 2 < dst_end_1)) ? src_data_0.y : dst_data_1.y;
dst_data_1.z = ((dst_index_1 + 4 >= dst_start_1) && (dst_index_1 + 4 < dst_end_1)) ? src_data_0.z : dst_data_1.z;
dst_data_1.w = (dst_index_1 + 6 < dst_end_1) ? src_data_0.w : dst_data_1.w;
*((__global ushort4 *)((__global char *)dst + dst_index_0)) = dst_data_0;
*((__global ushort4 *)((__global char *)dst + dst_index_1)) = dst_data_1;
}
}
__kernel void arithm_flip_rows_D3 (__global short *src, int src_step, int src_offset,
__global short *dst, int dst_step, int dst_offset,
int rows, int cols, int thread_rows, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < thread_rows)
{
x = x << 2;
#define dst_align (((dst_offset >> 1) & 3) << 1)
int src_index_0 = mad24(y, src_step, (x << 1) + src_offset - dst_align);
int src_index_1 = mad24(rows - y - 1, src_step, (x << 1) + src_offset - dst_align);
int dst_start_0 = mad24(y, dst_step, dst_offset);
int dst_start_1 = mad24(rows - y - 1, dst_step, dst_offset);
int dst_end_0 = mad24(y, dst_step, dst_offset + dst_step1);
int dst_end_1 = mad24(rows - y - 1, dst_step, dst_offset + dst_step1);
int dst_index_0 = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8);
int dst_index_1 = mad24(rows - y - 1, dst_step, dst_offset + (x << 1) & (int)0xfffffff8);
short4 src_data_0 = vload4(0, (__global short *)((__global char *)src + src_index_0));
short4 src_data_1 = vload4(0, (__global short *)((__global char *)src + src_index_1));
short4 dst_data_0 = *((__global short4 *)((__global char *)dst + dst_index_0));
short4 dst_data_1 = *((__global short4 *)((__global char *)dst + dst_index_1));
dst_data_0.x = (dst_index_0 + 0 >= dst_start_0) ? src_data_1.x : dst_data_0.x;
dst_data_0.y = ((dst_index_0 + 2 >= dst_start_0) && (dst_index_0 + 2 < dst_end_0)) ? src_data_1.y : dst_data_0.y;
dst_data_0.z = ((dst_index_0 + 4 >= dst_start_0) && (dst_index_0 + 4 < dst_end_0)) ? src_data_1.z : dst_data_0.z;
dst_data_0.w = (dst_index_0 + 6 < dst_end_0) ? src_data_1.w : dst_data_0.w;
dst_data_1.x = (dst_index_1 + 0 >= dst_start_1) ? src_data_0.x : dst_data_1.x;
dst_data_1.y = ((dst_index_1 + 2 >= dst_start_1) && (dst_index_1 + 2 < dst_end_1)) ? src_data_0.y : dst_data_1.y;
dst_data_1.z = ((dst_index_1 + 4 >= dst_start_1) && (dst_index_1 + 4 < dst_end_1)) ? src_data_0.z : dst_data_1.z;
dst_data_1.w = (dst_index_1 + 6 < dst_end_1) ? src_data_0.w : dst_data_1.w;
*((__global short4 *)((__global char *)dst + dst_index_0)) = dst_data_0;
*((__global short4 *)((__global char *)dst + dst_index_1)) = dst_data_1;
}
}
__kernel void arithm_flip_rows_D4 (__global int *src, int src_step, int src_offset,
__global int *dst, int dst_step, int dst_offset,
int rows, int cols, int thread_rows, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < thread_rows)
{
int src_index_0 = mad24(y, src_step, (x << 2) + src_offset);
int src_index_1 = mad24(rows - y - 1, src_step, (x << 2) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x << 2) + dst_offset);
int dst_index_1 = mad24(rows - y - 1, dst_step, (x << 2) + dst_offset);
int data0 = *((__global int *)((__global char *)src + src_index_0));
int data1 = *((__global int *)((__global char *)src + src_index_1));
*((__global int *)((__global char *)dst + dst_index_0)) = data1;
*((__global int *)((__global char *)dst + dst_index_1)) = data0;
}
}
__kernel void arithm_flip_rows_D5 (__global float *src, int src_step, int src_offset,
__global float *dst, int dst_step, int dst_offset,
int rows, int cols, int thread_rows, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < thread_rows)
{
int src_index_0 = mad24(y, src_step, (x << 2) + src_offset);
int src_index_1 = mad24(rows - y - 1, src_step, (x << 2) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x << 2) + dst_offset);
int dst_index_1 = mad24(rows - y - 1, dst_step, (x << 2) + dst_offset);
float data0 = *((__global float *)((__global char *)src + src_index_0));
float data1 = *((__global float *)((__global char *)src + src_index_1));
*((__global float *)((__global char *)dst + dst_index_0)) = data1;
*((__global float *)((__global char *)dst + dst_index_1)) = data0;
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_flip_rows_D6 (__global double *src, int src_step, int src_offset,
__global double *dst, int dst_step, int dst_offset,
int rows, int cols, int thread_rows, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < thread_rows)
{
int src_index_0 = mad24(y, src_step, (x << 3) + src_offset);
int src_index_1 = mad24(rows - y - 1, src_step, (x << 3) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x << 3) + dst_offset);
int dst_index_1 = mad24(rows - y - 1, dst_step, (x << 3) + dst_offset);
double data0 = *((__global double *)((__global char *)src + src_index_0));
double data1 = *((__global double *)((__global char *)src + src_index_1));
*((__global double *)((__global char *)dst + dst_index_0)) = data1;
*((__global double *)((__global char *)dst + dst_index_1)) = data0;
}
}
#endif
//////////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////flip cols///////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////////
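/* Horizontal flip. Work-items are assumed to cover the left half of each row
   (thread_cols about cols / 2) and swap element x with element cols - 1 - x.
   The _C<n> suffix is the channel count; the C2 variants move two-channel
   pairs (uchar2, char2, ushort2, ...) as a unit so channels stay together. */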
__kernel void arithm_flip_cols_C1_D0 (__global uchar *src, int src_step, int src_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int thread_cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < thread_cols && y < rows)
{
int src_index_0 = mad24(y, src_step, (x) + src_offset);
int src_index_1 = mad24(y, src_step, (cols - x -1) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x) + dst_offset);
int dst_index_1 = mad24(y, dst_step, (cols - x -1) + dst_offset);
uchar data0 = *(src + src_index_0);
uchar data1 = *(src + src_index_1);
*(dst + dst_index_0) = data1;
*(dst + dst_index_1) = data0;
}
}
__kernel void arithm_flip_cols_C1_D1 (__global char *src, int src_step, int src_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int thread_cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < thread_cols && y < rows)
{
int src_index_0 = mad24(y, src_step, (x) + src_offset);
int src_index_1 = mad24(y, src_step, (cols - x -1) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x) + dst_offset);
int dst_index_1 = mad24(y, dst_step, (cols - x -1) + dst_offset);
char data0 = *(src + src_index_0);
char data1 = *(src + src_index_1);
*(dst + dst_index_0) = data1;
*(dst + dst_index_1) = data0;
}
}
__kernel void arithm_flip_cols_C1_D2 (__global ushort *src, int src_step, int src_offset,
__global ushort *dst, int dst_step, int dst_offset,
int rows, int cols, int thread_cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < thread_cols && y < rows)
{
int src_index_0 = mad24(y, src_step, (x << 1) + src_offset);
int src_index_1 = mad24(y, src_step, ((cols - x -1) << 1) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x << 1) + dst_offset);
int dst_index_1 = mad24(y, dst_step, ((cols - x -1) << 1) + dst_offset);
ushort data0 = *((__global ushort *)((__global char *)src + src_index_0));
ushort data1 = *((__global ushort *)((__global char *)src + src_index_1));
*((__global ushort *)((__global char *)dst + dst_index_0)) = data1;
*((__global ushort *)((__global char *)dst + dst_index_1)) = data0;
}
}
__kernel void arithm_flip_cols_C1_D3 (__global short *src, int src_step, int src_offset,
__global short *dst, int dst_step, int dst_offset,
int rows, int cols, int thread_cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < thread_cols && y < rows)
{
int src_index_0 = mad24(y, src_step, (x << 1) + src_offset);
int src_index_1 = mad24(y, src_step, ((cols - x -1) << 1) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x << 1) + dst_offset);
int dst_index_1 = mad24(y, dst_step, ((cols - x -1) << 1) + dst_offset);
short data0 = *((__global short *)((__global char *)src + src_index_0));
short data1 = *((__global short *)((__global char *)src + src_index_1));
*((__global short *)((__global char *)dst + dst_index_0)) = data1;
*((__global short *)((__global char *)dst + dst_index_1)) = data0;
}
}
__kernel void arithm_flip_cols_C1_D4 (__global int *src, int src_step, int src_offset,
__global int *dst, int dst_step, int dst_offset,
int rows, int cols, int thread_cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < thread_cols && y < rows)
{
int src_index_0 = mad24(y, src_step, (x << 2) + src_offset);
int src_index_1 = mad24(y, src_step, ((cols - x -1) << 2) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x << 2) + dst_offset);
int dst_index_1 = mad24(y, dst_step, ((cols - x -1) << 2) + dst_offset);
int data0 = *((__global int *)((__global char *)src + src_index_0));
int data1 = *((__global int *)((__global char *)src + src_index_1));
*((__global int *)((__global char *)dst + dst_index_0)) = data1;
*((__global int *)((__global char *)dst + dst_index_1)) = data0;
}
}
__kernel void arithm_flip_cols_C1_D5 (__global float *src, int src_step, int src_offset,
__global float *dst, int dst_step, int dst_offset,
int rows, int cols, int thread_cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < thread_cols && y < rows)
{
int src_index_0 = mad24(y, src_step, (x << 2) + src_offset);
int src_index_1 = mad24(y, src_step, ((cols - x -1) << 2) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x << 2) + dst_offset);
int dst_index_1 = mad24(y, dst_step, ((cols - x -1) << 2) + dst_offset);
float data0 = *((__global float *)((__global char *)src + src_index_0));
float data1 = *((__global float *)((__global char *)src + src_index_1));
*((__global float *)((__global char *)dst + dst_index_0)) = data1;
*((__global float *)((__global char *)dst + dst_index_1)) = data0;
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_flip_cols_C1_D6 (__global double *src, int src_step, int src_offset,
__global double *dst, int dst_step, int dst_offset,
int rows, int cols, int thread_cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < thread_cols && y < rows)
{
int src_index_0 = mad24(y, src_step, (x << 3) + src_offset);
int src_index_1 = mad24(y, src_step, ((cols - x -1) << 3) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x << 3) + dst_offset);
int dst_index_1 = mad24(y, dst_step, ((cols - x -1) << 3) + dst_offset);
double data0 = *((__global double *)((__global char *)src + src_index_0));
double data1 = *((__global double *)((__global char *)src + src_index_1));
*((__global double *)((__global char *)dst + dst_index_0)) = data1;
*((__global double *)((__global char *)dst + dst_index_1)) = data0;
}
}
#endif
__kernel void arithm_flip_cols_C2_D0 (__global uchar *src, int src_step, int src_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int thread_cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < thread_cols && y < rows)
{
int src_index_0 = mad24(y, src_step, (x << 1) + src_offset);
int src_index_1 = mad24(y, src_step, ((cols - x -1) << 1) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x << 1) + dst_offset);
int dst_index_1 = mad24(y, dst_step, ((cols - x -1) << 1) + dst_offset);
uchar2 data0 = *((__global uchar2 *)((__global char *)src + src_index_0));
uchar2 data1 = *((__global uchar2 *)((__global char *)src + src_index_1));
*((__global uchar2 *)((__global char *)dst + dst_index_0)) = data1;
*((__global uchar2 *)((__global char *)dst + dst_index_1)) = data0;
}
}
__kernel void arithm_flip_cols_C2_D1 (__global char *src, int src_step, int src_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int thread_cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < thread_cols && y < rows)
{
int src_index_0 = mad24(y, src_step, (x << 1) + src_offset);
int src_index_1 = mad24(y, src_step, ((cols - x -1) << 1) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x << 1) + dst_offset);
int dst_index_1 = mad24(y, dst_step, ((cols - x -1) << 1) + dst_offset);
char2 data0 = *((__global char2 *)((__global char *)src + src_index_0));
char2 data1 = *((__global char2 *)((__global char *)src + src_index_1));
*((__global char2 *)((__global char *)dst + dst_index_0)) = data1;
*((__global char2 *)((__global char *)dst + dst_index_1)) = data0;
}
}
__kernel void arithm_flip_cols_C2_D2 (__global ushort *src, int src_step, int src_offset,
__global ushort *dst, int dst_step, int dst_offset,
int rows, int cols, int thread_cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < thread_cols && y < rows)
{
int src_index_0 = mad24(y, src_step, (x << 2) + src_offset);
int src_index_1 = mad24(y, src_step, ((cols - x -1) << 2) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x << 2) + dst_offset);
int dst_index_1 = mad24(y, dst_step, ((cols - x -1) << 2) + dst_offset);
ushort2 data0 = *((__global ushort2 *)((__global char *)src + src_index_0));
ushort2 data1 = *((__global ushort2 *)((__global char *)src + src_index_1));
*((__global ushort2 *)((__global char *)dst + dst_index_0)) = data1;
*((__global ushort2 *)((__global char *)dst + dst_index_1)) = data0;
}
}
__kernel void arithm_flip_cols_C2_D3 (__global short *src, int src_step, int src_offset,
__global short *dst, int dst_step, int dst_offset,
int rows, int cols, int thread_cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < thread_cols && y < rows)
{
int src_index_0 = mad24(y, src_step, (x << 2) + src_offset);
int src_index_1 = mad24(y, src_step, ((cols - x -1) << 2) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x << 2) + dst_offset);
int dst_index_1 = mad24(y, dst_step, ((cols - x -1) << 2) + dst_offset);
short2 data0 = *((__global short2 *)((__global char *)src + src_index_0));
short2 data1 = *((__global short2 *)((__global char *)src + src_index_1));
*((__global short2 *)((__global char *)dst + dst_index_0)) = data1;
*((__global short2 *)((__global char *)dst + dst_index_1)) = data0;
}
}
__kernel void arithm_flip_cols_C2_D4 (__global int *src, int src_step, int src_offset,
__global int *dst, int dst_step, int dst_offset,
int rows, int cols, int thread_cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < thread_cols && y < rows)
{
int src_index_0 = mad24(y, src_step, (x << 3) + src_offset);
int src_index_1 = mad24(y, src_step, ((cols - x -1) << 3) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x << 3) + dst_offset);
int dst_index_1 = mad24(y, dst_step, ((cols - x -1) << 3) + dst_offset);
int2 data0 = *((__global int2 *)((__global char *)src + src_index_0));
int2 data1 = *((__global int2 *)((__global char *)src + src_index_1));
*((__global int2 *)((__global char *)dst + dst_index_0)) = data1;
*((__global int2 *)((__global char *)dst + dst_index_1)) = data0;
}
}
__kernel void arithm_flip_cols_C2_D5 (__global float *src, int src_step, int src_offset,
__global float *dst, int dst_step, int dst_offset,
int rows, int cols, int thread_cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < thread_cols && y < rows)
{
int src_index_0 = mad24(y, src_step, (x << 3) + src_offset);
int src_index_1 = mad24(y, src_step, ((cols - x -1) << 3) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x << 3) + dst_offset);
int dst_index_1 = mad24(y, dst_step, ((cols - x -1) << 3) + dst_offset);
float2 data0 = *((__global float2 *)((__global char *)src + src_index_0));
float2 data1 = *((__global float2 *)((__global char *)src + src_index_1));
*((__global float2 *)((__global char *)dst + dst_index_0)) = data1;
*((__global float2 *)((__global char *)dst + dst_index_1)) = data0;
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_flip_cols_C2_D6 (__global double *src, int src_step, int src_offset,
__global double *dst, int dst_step, int dst_offset,
int rows, int cols, int thread_cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < thread_cols && y < rows)
{
int src_index_0 = mad24(y, src_step, (x << 4) + src_offset);
int src_index_1 = mad24(y, src_step, ((cols - x -1) << 4) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x << 4) + dst_offset);
int dst_index_1 = mad24(y, dst_step, ((cols - x -1) << 4) + dst_offset);
double2 data0 = *((__global double2 *)((__global char *)src + src_index_0));
double2 data1 = *((__global double2 *)((__global char *)src + src_index_1));
*((__global double2 *)((__global char *)dst + dst_index_0)) = data1;
*((__global double2 *)((__global char *)dst + dst_index_1)) = data0;
}
}
#endif
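// The C2 kernels above move each pixel as a native 2-vector (uchar2 ...
// double2); the byte offset therefore scales by 2*sizeof(type), i.e. << 1
// for uchar up to << 4 for double.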
__kernel void arithm_flip_cols_C3_D0 (__global uchar *src, int src_step, int src_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int thread_cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < thread_cols && y < rows)
{
int src_index_0 = mad24(y, src_step, (x) * 3 + src_offset);
int src_index_1 = mad24(y, src_step, (cols - x -1) * 3 + src_offset);
int dst_index_0 = mad24(y, dst_step, (x) * 3 + dst_offset);
int dst_index_1 = mad24(y, dst_step, (cols - x -1) * 3 + dst_offset);
uchar data0_0 = *(src + src_index_0 + 0);
uchar data0_1 = *(src + src_index_0 + 1);
uchar data0_2 = *(src + src_index_0 + 2);
uchar data1_0 = *(src + src_index_1 + 0);
uchar data1_1 = *(src + src_index_1 + 1);
uchar data1_2 = *(src + src_index_1 + 2);
*(dst + dst_index_0 + 0 ) = data1_0;
*(dst + dst_index_0 + 1 ) = data1_1;
*(dst + dst_index_0 + 2 ) = data1_2;
*(dst + dst_index_1 + 0) = data0_0;
*(dst + dst_index_1 + 1) = data0_1;
*(dst + dst_index_1 + 2) = data0_2;
}
}
__kernel void arithm_flip_cols_C3_D1 (__global char *src, int src_step, int src_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int thread_cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < thread_cols && y < rows)
{
int src_index_0 = mad24(y, src_step, (x) * 3 + src_offset);
int src_index_1 = mad24(y, src_step, (cols - x -1) * 3 + src_offset);
int dst_index_0 = mad24(y, dst_step, (x) * 3 + dst_offset);
int dst_index_1 = mad24(y, dst_step, (cols - x -1) * 3 + dst_offset);
char data0_0 = *(src + src_index_0 + 0);
char data0_1 = *(src + src_index_0 + 1);
char data0_2 = *(src + src_index_0 + 2);
char data1_0 = *(src + src_index_1 + 0);
char data1_1 = *(src + src_index_1 + 1);
char data1_2 = *(src + src_index_1 + 2);
*(dst + dst_index_0 + 0 ) = data1_0;
*(dst + dst_index_0 + 1 ) = data1_1;
*(dst + dst_index_0 + 2 ) = data1_2;
*(dst + dst_index_1 + 0) = data0_0;
*(dst + dst_index_1 + 1) = data0_1;
*(dst + dst_index_1 + 2) = data0_2;
}
}
__kernel void arithm_flip_cols_C3_D2 (__global ushort *src, int src_step, int src_offset,
__global ushort *dst, int dst_step, int dst_offset,
int rows, int cols, int thread_cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < thread_cols && y < rows)
{
int src_index_0 = mad24(y, src_step, (x * 3 << 1) + src_offset);
int src_index_1 = mad24(y, src_step, ((cols - x -1) * 3 << 1) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x * 3 << 1) + dst_offset);
int dst_index_1 = mad24(y, dst_step, ((cols - x -1) * 3 << 1) + dst_offset);
ushort data0_0 = *((__global ushort *)((__global char *)src + src_index_0 + 0));
ushort data0_1 = *((__global ushort *)((__global char *)src + src_index_0 + 2));
ushort data0_2 = *((__global ushort *)((__global char *)src + src_index_0 + 4));
ushort data1_0 = *((__global ushort *)((__global char *)src + src_index_1 + 0));
ushort data1_1 = *((__global ushort *)((__global char *)src + src_index_1 + 2));
ushort data1_2 = *((__global ushort *)((__global char *)src + src_index_1 + 4));
*((__global ushort *)((__global char *)dst + dst_index_0 + 0)) = data1_0;
*((__global ushort *)((__global char *)dst + dst_index_0 + 2)) = data1_1;
*((__global ushort *)((__global char *)dst + dst_index_0 + 4)) = data1_2;
*((__global ushort *)((__global char *)dst + dst_index_1 + 0)) = data0_0;
*((__global ushort *)((__global char *)dst + dst_index_1 + 2)) = data0_1;
*((__global ushort *)((__global char *)dst + dst_index_1 + 4)) = data0_2;
}
}
__kernel void arithm_flip_cols_C3_D3 (__global short *src, int src_step, int src_offset,
__global short *dst, int dst_step, int dst_offset,
int rows, int cols, int thread_cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < thread_cols && y < rows)
{
int src_index_0 = mad24(y, src_step, (x * 3 << 1) + src_offset);
int src_index_1 = mad24(y, src_step, ((cols - x -1) * 3 << 1) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x * 3 << 1) + dst_offset);
int dst_index_1 = mad24(y, dst_step, ((cols - x -1) * 3 << 1) + dst_offset);
short data0_0 = *((__global short *)((__global char *)src + src_index_0 + 0));
short data0_1 = *((__global short *)((__global char *)src + src_index_0 + 2));
short data0_2 = *((__global short *)((__global char *)src + src_index_0 + 4));
short data1_0 = *((__global short *)((__global char *)src + src_index_1 + 0));
short data1_1 = *((__global short *)((__global char *)src + src_index_1 + 2));
short data1_2 = *((__global short *)((__global char *)src + src_index_1 + 4));
*((__global short *)((__global char *)dst + dst_index_0 + 0)) = data1_0;
*((__global short *)((__global char *)dst + dst_index_0 + 2)) = data1_1;
*((__global short *)((__global char *)dst + dst_index_0 + 4)) = data1_2;
*((__global short *)((__global char *)dst + dst_index_1 + 0)) = data0_0;
*((__global short *)((__global char *)dst + dst_index_1 + 2)) = data0_1;
*((__global short *)((__global char *)dst + dst_index_1 + 4)) = data0_2;
}
}
__kernel void arithm_flip_cols_C3_D4 (__global int *src, int src_step, int src_offset,
__global int *dst, int dst_step, int dst_offset,
int rows, int cols, int thread_cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < thread_cols && y < rows)
{
int src_index_0 = mad24(y, src_step, (x * 3 << 2) + src_offset);
int src_index_1 = mad24(y, src_step, ((cols - x -1) * 3 << 2) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x * 3 << 2) + dst_offset);
int dst_index_1 = mad24(y, dst_step, ((cols - x -1) * 3 << 2) + dst_offset);
int data0_0 = *((__global int *)((__global char *)src + src_index_0 + 0));
int data0_1 = *((__global int *)((__global char *)src + src_index_0 + 4));
int data0_2 = *((__global int *)((__global char *)src + src_index_0 + 8));
int data1_0 = *((__global int *)((__global char *)src + src_index_1 + 0));
int data1_1 = *((__global int *)((__global char *)src + src_index_1 + 4));
int data1_2 = *((__global int *)((__global char *)src + src_index_1 + 8));
*((__global int *)((__global char *)dst + dst_index_0 + 0)) = data1_0;
*((__global int *)((__global char *)dst + dst_index_0 + 4)) = data1_1;
*((__global int *)((__global char *)dst + dst_index_0 + 8)) = data1_2;
*((__global int *)((__global char *)dst + dst_index_1 + 0)) = data0_0;
*((__global int *)((__global char *)dst + dst_index_1 + 4)) = data0_1;
*((__global int *)((__global char *)dst + dst_index_1 + 8)) = data0_2;
}
}
__kernel void arithm_flip_cols_C3_D5 (__global float *src, int src_step, int src_offset,
__global float *dst, int dst_step, int dst_offset,
int rows, int cols, int thread_cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < thread_cols && y < rows)
{
int src_index_0 = mad24(y, src_step, (x * 3 << 2) + src_offset);
int src_index_1 = mad24(y, src_step, ((cols - x -1) * 3 << 2) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x * 3 << 2) + dst_offset);
int dst_index_1 = mad24(y, dst_step, ((cols - x -1) * 3 << 2) + dst_offset);
float data0_0 = *((__global float *)((__global char *)src + src_index_0 + 0));
float data0_1 = *((__global float *)((__global char *)src + src_index_0 + 4));
float data0_2 = *((__global float *)((__global char *)src + src_index_0 + 8));
float data1_0 = *((__global float *)((__global char *)src + src_index_1 + 0));
float data1_1 = *((__global float *)((__global char *)src + src_index_1 + 4));
float data1_2 = *((__global float *)((__global char *)src + src_index_1 + 8));
*((__global float *)((__global char *)dst + dst_index_0 + 0)) = data1_0;
*((__global float *)((__global char *)dst + dst_index_0 + 4)) = data1_1;
*((__global float *)((__global char *)dst + dst_index_0 + 8)) = data1_2;
*((__global float *)((__global char *)dst + dst_index_1 + 0)) = data0_0;
*((__global float *)((__global char *)dst + dst_index_1 + 4)) = data0_1;
*((__global float *)((__global char *)dst + dst_index_1 + 8)) = data0_2;
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_flip_cols_C3_D6 (__global double *src, int src_step, int src_offset,
__global double *dst, int dst_step, int dst_offset,
int rows, int cols, int thread_cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < thread_cols && y < rows)
{
int src_index_0 = mad24(y, src_step, (x * 3 << 3) + src_offset);
int src_index_1 = mad24(y, src_step, ((cols - x -1) * 3 << 3) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x * 3 << 3) + dst_offset);
int dst_index_1 = mad24(y, dst_step, ((cols - x -1) * 3 << 3) + dst_offset);
double data0_0 = *((__global double *)((__global char *)src + src_index_0 + 0));
double data0_1 = *((__global double *)((__global char *)src + src_index_0 + 8));
double data0_2 = *((__global double *)((__global char *)src + src_index_0 + 16));
double data1_0 = *((__global double *)((__global char *)src + src_index_1 + 0));
double data1_1 = *((__global double *)((__global char *)src + src_index_1 + 8));
double data1_2 = *((__global double *)((__global char *)src + src_index_1 + 16));
*((__global double *)((__global char *)dst + dst_index_0 + 0 )) = data1_0;
*((__global double *)((__global char *)dst + dst_index_0 + 8 )) = data1_1;
*((__global double *)((__global char *)dst + dst_index_0 + 16)) = data1_2;
*((__global double *)((__global char *)dst + dst_index_1 + 0 )) = data0_0;
*((__global double *)((__global char *)dst + dst_index_1 + 8 )) = data0_1;
*((__global double *)((__global char *)dst + dst_index_1 + 16)) = data0_2;
}
}
#endif
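// The C3 kernels above copy the three channels scalar by scalar: the pixel
// stride 3*sizeof(type) is not a power of two, so no single vector load
// covers a whole pixel.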
__kernel void arithm_flip_cols_C4_D0 (__global uchar *src, int src_step, int src_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int thread_cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < thread_cols && y < rows)
{
int src_index_0 = mad24(y, src_step, (x << 2) + src_offset);
int src_index_1 = mad24(y, src_step, ((cols - x -1) << 2) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x << 2) + dst_offset);
int dst_index_1 = mad24(y, dst_step, ((cols - x -1) << 2) + dst_offset);
uchar4 data0 = *((__global uchar4 *)(src + src_index_0));
uchar4 data1 = *((__global uchar4 *)(src + src_index_1));
*((__global uchar4 *)(dst + dst_index_0)) = data1;
*((__global uchar4 *)(dst + dst_index_1)) = data0;
}
}
__kernel void arithm_flip_cols_C4_D1 (__global char *src, int src_step, int src_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int thread_cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < thread_cols && y < rows)
{
int src_index_0 = mad24(y, src_step, (x << 2) + src_offset);
int src_index_1 = mad24(y, src_step, ((cols - x -1) << 2) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x << 2) + dst_offset);
int dst_index_1 = mad24(y, dst_step, ((cols - x -1) << 2) + dst_offset);
char4 data0 = *((__global char4 *)(src + src_index_0));
char4 data1 = *((__global char4 *)(src + src_index_1));
*((__global char4 *)(dst + dst_index_0)) = data1;
*((__global char4 *)(dst + dst_index_1)) = data0;
}
}
__kernel void arithm_flip_cols_C4_D2 (__global ushort *src, int src_step, int src_offset,
__global ushort *dst, int dst_step, int dst_offset,
int rows, int cols, int thread_cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < thread_cols && y < rows)
{
int src_index_0 = mad24(y, src_step, (x << 3) + src_offset);
int src_index_1 = mad24(y, src_step, ((cols - x -1) << 3) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x << 3) + dst_offset);
int dst_index_1 = mad24(y, dst_step, ((cols - x -1) << 3) + dst_offset);
ushort4 data0 = *((__global ushort4 *)((__global char *)src + src_index_0));
ushort4 data1 = *((__global ushort4 *)((__global char *)src + src_index_1));
*((__global ushort4 *)((__global char *)dst + dst_index_0)) = data1;
*((__global ushort4 *)((__global char *)dst + dst_index_1)) = data0;
}
}
__kernel void arithm_flip_cols_C4_D3 (__global short *src, int src_step, int src_offset,
__global short *dst, int dst_step, int dst_offset,
int rows, int cols, int thread_cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < thread_cols && y < rows)
{
int src_index_0 = mad24(y, src_step, (x << 3) + src_offset);
int src_index_1 = mad24(y, src_step, ((cols - x -1) << 3) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x << 3) + dst_offset);
int dst_index_1 = mad24(y, dst_step, ((cols - x -1) << 3) + dst_offset);
short4 data0 = *((__global short4 *)((__global char *)src + src_index_0));
short4 data1 = *((__global short4 *)((__global char *)src + src_index_1));
*((__global short4 *)((__global char *)dst + dst_index_0)) = data1;
*((__global short4 *)((__global char *)dst + dst_index_1)) = data0;
}
}
__kernel void arithm_flip_cols_C4_D4 (__global int *src, int src_step, int src_offset,
__global int *dst, int dst_step, int dst_offset,
int rows, int cols, int thread_cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < thread_cols && y < rows)
{
int src_index_0 = mad24(y, src_step, (x << 4) + src_offset);
int src_index_1 = mad24(y, src_step, ((cols - x -1) << 4) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x << 4) + dst_offset);
int dst_index_1 = mad24(y, dst_step, ((cols - x -1) << 4) + dst_offset);
int4 data0 = *((__global int4 *)((__global char *)src + src_index_0));
int4 data1 = *((__global int4 *)((__global char *)src + src_index_1));
*((__global int4 *)((__global char *)dst + dst_index_0)) = data1;
*((__global int4 *)((__global char *)dst + dst_index_1)) = data0;
}
}
__kernel void arithm_flip_cols_C4_D5 (__global float *src, int src_step, int src_offset,
__global float *dst, int dst_step, int dst_offset,
int rows, int cols, int thread_cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < thread_cols && y < rows)
{
int src_index_0 = mad24(y, src_step, (x << 4) + src_offset);
int src_index_1 = mad24(y, src_step, ((cols - x -1) << 4) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x << 4) + dst_offset);
int dst_index_1 = mad24(y, dst_step, ((cols - x -1) << 4) + dst_offset);
float4 data0 = *((__global float4 *)((__global char *)src + src_index_0));
float4 data1 = *((__global float4 *)((__global char *)src + src_index_1));
*((__global float4 *)((__global char *)dst + dst_index_0)) = data1;
*((__global float4 *)((__global char *)dst + dst_index_1)) = data0;
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_flip_cols_C4_D6 (__global double *src, int src_step, int src_offset,
__global double *dst, int dst_step, int dst_offset,
int rows, int cols, int thread_cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < thread_cols && y < rows)
{
int src_index_0 = mad24(y, src_step, (x << 5) + src_offset);
int src_index_1 = mad24(y, src_step, ((cols - x -1) << 5) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x << 5) + dst_offset);
int dst_index_1 = mad24(y, dst_step, ((cols - x -1) << 5) + dst_offset);
double4 data0 = *((__global double4 *)((__global char *)src + src_index_0));
double4 data1 = *((__global double4 *)((__global char *)src + src_index_1));
*((__global double4 *)((__global char *)dst + dst_index_0)) = data1;
*((__global double4 *)((__global char *)dst + dst_index_1)) = data0;
}
}
#endif
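// The C4 kernels load whole pixels as native 4-vectors (uchar4 ... double4),
// with byte offsets scaled by 4*sizeof(type): << 2 for uchar up to << 5 for
// double.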

View File

@ -0,0 +1,753 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Jia Haipeng, jiahaipeng95@gmail.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if defined (DOUBLE_SUPPORT)
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#endif
//////////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////flip rows and cols///////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////////////////////
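// The _rc variants flip about both axes (a 180-degree rotation): each
// work-item swaps pixel (y, x) with (rows-1-y, cols-1-x), and the guard
// y < thread_rows means the host presumably launches only about rows/2
// rows' worth of work-items. The per-type indexing mirrors the column-flip
// kernels above.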
__kernel void arithm_flip_rc_C1_D0 (__global uchar *src, int src_step, int src_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int thread_rows, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < thread_rows)
{
int src_index_0 = mad24(y, src_step, (x) + src_offset);
int src_index_1 = mad24(rows - y - 1, src_step, (cols - x -1) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x) + dst_offset);
int dst_index_1 = mad24(rows - y - 1, dst_step, (cols - x -1) + dst_offset);
uchar data0 = *(src + src_index_0);
uchar data1 = *(src + src_index_1);
*(dst + dst_index_0) = data1;
*(dst + dst_index_1) = data0;
}
}
__kernel void arithm_flip_rc_C1_D1 (__global char *src, int src_step, int src_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int thread_rows, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < thread_rows)
{
int src_index_0 = mad24(y, src_step, (x) + src_offset);
int src_index_1 = mad24(rows - y - 1, src_step, (cols - x -1) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x) + dst_offset);
int dst_index_1 = mad24(rows - y - 1, dst_step, (cols - x -1) + dst_offset);
char data0 = *(src + src_index_0);
char data1 = *(src + src_index_1);
*(dst + dst_index_0) = data1;
*(dst + dst_index_1) = data0;
}
}
__kernel void arithm_flip_rc_C1_D2 (__global ushort *src, int src_step, int src_offset,
__global ushort *dst, int dst_step, int dst_offset,
int rows, int cols, int thread_rows, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < thread_rows)
{
int src_index_0 = mad24(y, src_step, (x << 1) + src_offset);
int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) << 1) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x << 1) + dst_offset);
int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) << 1) + dst_offset);
ushort data0 = *((__global ushort *)((__global char *)src + src_index_0));
ushort data1 = *((__global ushort *)((__global char *)src + src_index_1));
*((__global ushort *)((__global char *)dst + dst_index_0)) = data1;
*((__global ushort *)((__global char *)dst + dst_index_1)) = data0;
}
}
__kernel void arithm_flip_rc_C1_D3 (__global short *src, int src_step, int src_offset,
__global short *dst, int dst_step, int dst_offset,
int rows, int cols, int thread_rows, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < thread_rows)
{
int src_index_0 = mad24(y, src_step, (x << 1) + src_offset);
int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) << 1) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x << 1) + dst_offset);
int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) << 1) + dst_offset);
short data0 = *((__global short *)((__global char *)src + src_index_0));
short data1 = *((__global short *)((__global char *)src + src_index_1));
*((__global short *)((__global char *)dst + dst_index_0)) = data1;
*((__global short *)((__global char *)dst + dst_index_1)) = data0;
}
}
__kernel void arithm_flip_rc_C1_D4 (__global int *src, int src_step, int src_offset,
__global int *dst, int dst_step, int dst_offset,
int rows, int cols, int thread_rows, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < thread_rows)
{
int src_index_0 = mad24(y, src_step, (x << 2) + src_offset);
int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) << 2) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x << 2) + dst_offset);
int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) << 2) + dst_offset);
int data0 = *((__global int *)((__global char *)src + src_index_0));
int data1 = *((__global int *)((__global char *)src + src_index_1));
*((__global int *)((__global char *)dst + dst_index_0)) = data1;
*((__global int *)((__global char *)dst + dst_index_1)) = data0;
}
}
__kernel void arithm_flip_rc_C1_D5 (__global float *src, int src_step, int src_offset,
__global float *dst, int dst_step, int dst_offset,
int rows, int cols, int thread_rows, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < thread_rows)
{
int src_index_0 = mad24(y, src_step, (x << 2) + src_offset);
int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) << 2) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x << 2) + dst_offset);
int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) << 2) + dst_offset);
float data0 = *((__global float *)((__global char *)src + src_index_0));
float data1 = *((__global float *)((__global char *)src + src_index_1));
*((__global float *)((__global char *)dst + dst_index_0)) = data1;
*((__global float *)((__global char *)dst + dst_index_1)) = data0;
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_flip_rc_C1_D6 (__global double *src, int src_step, int src_offset,
__global double *dst, int dst_step, int dst_offset,
int rows, int cols, int thread_rows, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < thread_rows)
{
int src_index_0 = mad24(y, src_step, (x << 3) + src_offset);
int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) << 3) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x << 3) + dst_offset);
int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) << 3) + dst_offset);
double data0 = *((__global double *)((__global char *)src + src_index_0));
double data1 = *((__global double *)((__global char *)src + src_index_1));
*((__global double *)((__global char *)dst + dst_index_0)) = data1;
*((__global double *)((__global char *)dst + dst_index_1)) = data0;
}
}
#endif
__kernel void arithm_flip_rc_C2_D0 (__global uchar *src, int src_step, int src_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int thread_rows, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < thread_rows)
{
int src_index_0 = mad24(y, src_step, (x << 1) + src_offset);
int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) << 1) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x << 1) + dst_offset);
int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) << 1) + dst_offset);
uchar2 data0 = *((__global uchar2 *)(src + src_index_0));
uchar2 data1 = *((__global uchar2 *)(src + src_index_1));
*((__global uchar2 *)(dst + dst_index_0)) = data1;
*((__global uchar2 *)(dst + dst_index_1)) = data0;
}
}
__kernel void arithm_flip_rc_C2_D1 (__global char *src, int src_step, int src_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int thread_rows, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < thread_rows)
{
int src_index_0 = mad24(y, src_step, (x << 1) + src_offset);
int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) << 1) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x << 1) + dst_offset);
int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) << 1) + dst_offset);
char2 data0 = *((__global char2 *)(src + src_index_0));
char2 data1 = *((__global char2 *)(src + src_index_1));
*((__global char2 *)(dst + dst_index_0)) = data1;
*((__global char2 *)(dst + dst_index_1)) = data0;
}
}
__kernel void arithm_flip_rc_C2_D2 (__global ushort *src, int src_step, int src_offset,
__global ushort *dst, int dst_step, int dst_offset,
int rows, int cols, int thread_rows, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < thread_rows)
{
int src_index_0 = mad24(y, src_step, (x << 2) + src_offset);
int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) << 2) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x << 2) + dst_offset);
int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) << 2) + dst_offset);
ushort2 data0 = *((__global ushort2 *)((__global char *)src + src_index_0));
ushort2 data1 = *((__global ushort2 *)((__global char *)src + src_index_1));
*((__global ushort2 *)((__global char *)dst + dst_index_0)) = data1;
*((__global ushort2 *)((__global char *)dst + dst_index_1)) = data0;
}
}
__kernel void arithm_flip_rc_C2_D3 (__global short *src, int src_step, int src_offset,
__global short *dst, int dst_step, int dst_offset,
int rows, int cols, int thread_rows, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < thread_rows)
{
int src_index_0 = mad24(y, src_step, (x << 2) + src_offset);
int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) << 2) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x << 2) + dst_offset);
int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) << 2) + dst_offset);
short2 data0 = *((__global short2 *)((__global char *)src + src_index_0));
short2 data1 = *((__global short2 *)((__global char *)src + src_index_1));
*((__global short2 *)((__global char *)dst + dst_index_0)) = data1;
*((__global short2 *)((__global char *)dst + dst_index_1)) = data0;
}
}
__kernel void arithm_flip_rc_C2_D4 (__global int *src, int src_step, int src_offset,
__global int *dst, int dst_step, int dst_offset,
int rows, int cols, int thread_rows, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < thread_rows)
{
int src_index_0 = mad24(y, src_step, (x << 3) + src_offset);
int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) << 3) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x << 3) + dst_offset);
int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) << 3) + dst_offset);
int2 data0 = *((__global int2 *)((__global char *)src + src_index_0));
int2 data1 = *((__global int2 *)((__global char *)src + src_index_1));
*((__global int2 *)((__global char *)dst + dst_index_0)) = data1;
*((__global int2 *)((__global char *)dst + dst_index_1)) = data0;
}
}
__kernel void arithm_flip_rc_C2_D5 (__global float *src, int src_step, int src_offset,
__global float *dst, int dst_step, int dst_offset,
int rows, int cols, int thread_rows, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < thread_rows)
{
int src_index_0 = mad24(y, src_step, (x << 3) + src_offset);
int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) << 3) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x << 3) + dst_offset);
int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) << 3) + dst_offset);
float2 data0 = *((__global float2 *)((__global char *)src + src_index_0));
float2 data1 = *((__global float2 *)((__global char *)src + src_index_1));
*((__global float2 *)((__global char *)dst + dst_index_0)) = data1;
*((__global float2 *)((__global char *)dst + dst_index_1)) = data0;
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_flip_rc_C2_D6 (__global double *src, int src_step, int src_offset,
__global double *dst, int dst_step, int dst_offset,
int rows, int cols, int thread_rows, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < thread_rows)
{
int src_index_0 = mad24(y, src_step, (x << 4) + src_offset);
int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) << 4) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x << 4) + dst_offset);
int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) << 4) + dst_offset);
double2 data0 = *((__global double2 *)((__global char *)src + src_index_0));
double2 data1 = *((__global double2 *)((__global char *)src + src_index_1));
*((__global double2 *)((__global char *)dst + dst_index_0)) = data1;
*((__global double2 *)((__global char *)dst + dst_index_1)) = data0;
}
}
#endif
__kernel void arithm_flip_rc_C3_D0 (__global uchar *src, int src_step, int src_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int thread_rows, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < thread_rows)
{
int src_index_0 = mad24(y, src_step, (x * 3) + src_offset);
int src_index_1 = mad24(rows - y - 1, src_step, (cols - x -1) * 3 + src_offset);
int dst_index_0 = mad24(y, dst_step, (x * 3) + dst_offset);
int dst_index_1 = mad24(rows - y - 1, dst_step, (cols - x -1) * 3 + dst_offset);
uchar data0_0 = *(src + src_index_0 + 0);
uchar data0_1 = *(src + src_index_0 + 1);
uchar data0_2 = *(src + src_index_0 + 2);
uchar data1_0 = *(src + src_index_1 + 0);
uchar data1_1 = *(src + src_index_1 + 1);
uchar data1_2 = *(src + src_index_1 + 2);
*(dst + dst_index_0 + 0 ) = data1_0;
*(dst + dst_index_0 + 1 ) = data1_1;
*(dst + dst_index_0 + 2 ) = data1_2;
*(dst + dst_index_1 + 0) = data0_0;
*(dst + dst_index_1 + 1) = data0_1;
*(dst + dst_index_1 + 2) = data0_2;
}
}
__kernel void arithm_flip_rc_C3_D1 (__global char *src, int src_step, int src_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int thread_rows, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < thread_rows)
{
int src_index_0 = mad24(y, src_step, (x * 3) + src_offset);
int src_index_1 = mad24(rows - y - 1, src_step, (cols - x -1) * 3 + src_offset);
int dst_index_0 = mad24(y, dst_step, (x * 3) + dst_offset);
int dst_index_1 = mad24(rows - y - 1, dst_step, (cols - x -1) * 3 + dst_offset);
char data0_0 = *(src + src_index_0 + 0);
char data0_1 = *(src + src_index_0 + 1);
char data0_2 = *(src + src_index_0 + 2);
char data1_0 = *(src + src_index_1 + 0);
char data1_1 = *(src + src_index_1 + 1);
char data1_2 = *(src + src_index_1 + 2);
*(dst + dst_index_0 + 0 ) = data1_0;
*(dst + dst_index_0 + 1 ) = data1_1;
*(dst + dst_index_0 + 2 ) = data1_2;
*(dst + dst_index_1 + 0) = data0_0;
*(dst + dst_index_1 + 1) = data0_1;
*(dst + dst_index_1 + 2) = data0_2;
}
}
__kernel void arithm_flip_rc_C3_D2 (__global ushort *src, int src_step, int src_offset,
__global ushort *dst, int dst_step, int dst_offset,
int rows, int cols, int thread_rows, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < thread_rows)
{
int src_index_0 = mad24(y, src_step, (x * 3 << 1) + src_offset);
int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) * 3 << 1) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x * 3 << 1) + dst_offset);
int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) * 3 << 1) + dst_offset);
ushort data0_0 = *((__global ushort *)((__global char *)src + src_index_0 + 0));
ushort data0_1 = *((__global ushort *)((__global char *)src + src_index_0 + 2));
ushort data0_2 = *((__global ushort *)((__global char *)src + src_index_0 + 4));
ushort data1_0 = *((__global ushort *)((__global char *)src + src_index_1 + 0));
ushort data1_1 = *((__global ushort *)((__global char *)src + src_index_1 + 2));
ushort data1_2 = *((__global ushort *)((__global char *)src + src_index_1 + 4));
*((__global ushort *)((__global char *)dst + dst_index_0 + 0)) = data1_0;
*((__global ushort *)((__global char *)dst + dst_index_0 + 2)) = data1_1;
*((__global ushort *)((__global char *)dst + dst_index_0 + 4)) = data1_2;
*((__global ushort *)((__global char *)dst + dst_index_1 + 0)) = data0_0;
*((__global ushort *)((__global char *)dst + dst_index_1 + 2)) = data0_1;
*((__global ushort *)((__global char *)dst + dst_index_1 + 4)) = data0_2;
}
}
__kernel void arithm_flip_rc_C3_D3 (__global short *src, int src_step, int src_offset,
__global short *dst, int dst_step, int dst_offset,
int rows, int cols, int thread_rows, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < thread_rows)
{
int src_index_0 = mad24(y, src_step, (x * 3 << 1) + src_offset);
int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) * 3 << 1) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x * 3 << 1) + dst_offset);
int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) * 3 << 1) + dst_offset);
short data0_0 = *((__global short *)((__global char *)src + src_index_0 + 0));
short data0_1 = *((__global short *)((__global char *)src + src_index_0 + 2));
short data0_2 = *((__global short *)((__global char *)src + src_index_0 + 4));
short data1_0 = *((__global short *)((__global char *)src + src_index_1 + 0));
short data1_1 = *((__global short *)((__global char *)src + src_index_1 + 2));
short data1_2 = *((__global short *)((__global char *)src + src_index_1 + 4));
*((__global short *)((__global char *)dst + dst_index_0 + 0)) = data1_0;
*((__global short *)((__global char *)dst + dst_index_0 + 2)) = data1_1;
*((__global short *)((__global char *)dst + dst_index_0 + 4)) = data1_2;
*((__global short *)((__global char *)dst + dst_index_1 + 0)) = data0_0;
*((__global short *)((__global char *)dst + dst_index_1 + 2)) = data0_1;
*((__global short *)((__global char *)dst + dst_index_1 + 4)) = data0_2;
}
}
__kernel void arithm_flip_rc_C3_D4 (__global int *src, int src_step, int src_offset,
__global int *dst, int dst_step, int dst_offset,
int rows, int cols, int thread_rows, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < thread_rows)
{
int src_index_0 = mad24(y, src_step, (x * 3 << 2) + src_offset);
int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) * 3 << 2) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x * 3 << 2) + dst_offset);
int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) * 3 << 2) + dst_offset);
int data0_0 = *((__global int *)((__global char *)src + src_index_0 + 0));
int data0_1 = *((__global int *)((__global char *)src + src_index_0 + 4));
int data0_2 = *((__global int *)((__global char *)src + src_index_0 + 8));
int data1_0 = *((__global int *)((__global char *)src + src_index_1 + 0));
int data1_1 = *((__global int *)((__global char *)src + src_index_1 + 4));
int data1_2 = *((__global int *)((__global char *)src + src_index_1 + 8));
*((__global int *)((__global char *)dst + dst_index_0 + 0)) = data1_0;
*((__global int *)((__global char *)dst + dst_index_0 + 4)) = data1_1;
*((__global int *)((__global char *)dst + dst_index_0 + 8)) = data1_2;
*((__global int *)((__global char *)dst + dst_index_1 + 0)) = data0_0;
*((__global int *)((__global char *)dst + dst_index_1 + 4)) = data0_1;
*((__global int *)((__global char *)dst + dst_index_1 + 8)) = data0_2;
}
}
__kernel void arithm_flip_rc_C3_D5 (__global float *src, int src_step, int src_offset,
__global float *dst, int dst_step, int dst_offset,
int rows, int cols, int thread_rows, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < thread_rows)
{
int src_index_0 = mad24(y, src_step, (x * 3 << 2) + src_offset);
int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) * 3 << 2) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x * 3 << 2) + dst_offset);
int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) * 3 << 2) + dst_offset);
float data0_0 = *((__global float *)((__global char *)src + src_index_0 + 0));
float data0_1 = *((__global float *)((__global char *)src + src_index_0 + 4));
float data0_2 = *((__global float *)((__global char *)src + src_index_0 + 8));
float data1_0 = *((__global float *)((__global char *)src + src_index_1 + 0));
float data1_1 = *((__global float *)((__global char *)src + src_index_1 + 4));
float data1_2 = *((__global float *)((__global char *)src + src_index_1 + 8));
*((__global float *)((__global char *)dst + dst_index_0 + 0)) = data1_0;
*((__global float *)((__global char *)dst + dst_index_0 + 4)) = data1_1;
*((__global float *)((__global char *)dst + dst_index_0 + 8)) = data1_2;
*((__global float *)((__global char *)dst + dst_index_1 + 0)) = data0_0;
*((__global float *)((__global char *)dst + dst_index_1 + 4)) = data0_1;
*((__global float *)((__global char *)dst + dst_index_1 + 8)) = data0_2;
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_flip_rc_C3_D6 (__global double *src, int src_step, int src_offset,
__global double *dst, int dst_step, int dst_offset,
int rows, int cols, int thread_rows, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < thread_rows)
{
int src_index_0 = mad24(y, src_step, (x * 3 << 3) + src_offset);
int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) * 3 << 3) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x * 3 << 3) + dst_offset);
int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) * 3 << 3) + dst_offset);
double data0_0 = *((__global double *)((__global char *)src + src_index_0 + 0 ));
double data0_1 = *((__global double *)((__global char *)src + src_index_0 + 8 ));
double data0_2 = *((__global double *)((__global char *)src + src_index_0 + 16));
double data1_0 = *((__global double *)((__global char *)src + src_index_1 + 0 ));
double data1_1 = *((__global double *)((__global char *)src + src_index_1 + 8 ));
double data1_2 = *((__global double *)((__global char *)src + src_index_1 + 16));
*((__global double *)((__global char *)dst + dst_index_0 + 0 )) = data1_0;
*((__global double *)((__global char *)dst + dst_index_0 + 8 )) = data1_1;
*((__global double *)((__global char *)dst + dst_index_0 + 16)) = data1_2;
*((__global double *)((__global char *)dst + dst_index_1 + 0 )) = data0_0;
*((__global double *)((__global char *)dst + dst_index_1 + 8 )) = data0_1;
*((__global double *)((__global char *)dst + dst_index_1 + 16)) = data0_2;
}
}
#endif
__kernel void arithm_flip_rc_C4_D0 (__global uchar *src, int src_step, int src_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int thread_rows, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < thread_rows)
{
int src_index_0 = mad24(y, src_step, (x << 2) + src_offset);
int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) << 2) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x << 2) + dst_offset);
int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) << 2) + dst_offset);
uchar4 data0 = *((__global uchar4 *)(src + src_index_0));
uchar4 data1 = *((__global uchar4 *)(src + src_index_1));
*((__global uchar4 *)(dst + dst_index_0)) = data1;
*((__global uchar4 *)(dst + dst_index_1)) = data0;
}
}
__kernel void arithm_flip_rc_C4_D1 (__global char *src, int src_step, int src_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int thread_rows, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < thread_rows)
{
int src_index_0 = mad24(y, src_step, (x << 2) + src_offset);
int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) << 2) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x << 2) + dst_offset);
int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) << 2) + dst_offset);
char4 data0 = *((__global char4 *)(src + src_index_0));
char4 data1 = *((__global char4 *)(src + src_index_1));
*((__global char4 *)(dst + dst_index_0)) = data1;
*((__global char4 *)(dst + dst_index_1)) = data0;
}
}
__kernel void arithm_flip_rc_C4_D2 (__global ushort *src, int src_step, int src_offset,
__global ushort *dst, int dst_step, int dst_offset,
int rows, int cols, int thread_rows, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < thread_rows)
{
int src_index_0 = mad24(y, src_step, (x << 3) + src_offset);
int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) << 3) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x << 3) + dst_offset);
int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) << 3) + dst_offset);
ushort4 data0 = *((__global ushort4 *)((__global char *)src + src_index_0));
ushort4 data1 = *((__global ushort4 *)((__global char *)src + src_index_1));
*((__global ushort4 *)((__global char *)dst + dst_index_0)) = data1;
*((__global ushort4 *)((__global char *)dst + dst_index_1)) = data0;
}
}
__kernel void arithm_flip_rc_C4_D3 (__global short *src, int src_step, int src_offset,
__global short *dst, int dst_step, int dst_offset,
int rows, int cols, int thread_rows, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < thread_rows)
{
int src_index_0 = mad24(y, src_step, (x << 3) + src_offset);
int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) << 3) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x << 3) + dst_offset);
int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) << 3) + dst_offset);
short4 data0 = *((__global short4 *)((__global char *)src + src_index_0));
short4 data1 = *((__global short4 *)((__global char *)src + src_index_1));
*((__global short4 *)((__global char *)dst + dst_index_0)) = data1;
*((__global short4 *)((__global char *)dst + dst_index_1)) = data0;
}
}
__kernel void arithm_flip_rc_C4_D4 (__global int *src, int src_step, int src_offset,
__global int *dst, int dst_step, int dst_offset,
int rows, int cols, int thread_rows, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < thread_rows)
{
int src_index_0 = mad24(y, src_step, (x << 4) + src_offset);
int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) << 4) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x << 4) + dst_offset);
int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) << 4) + dst_offset);
int4 data0 = *((__global int4 *)((__global char *)src + src_index_0));
int4 data1 = *((__global int4 *)((__global char *)src + src_index_1));
*((__global int4 *)((__global char *)dst + dst_index_0)) = data1;
*((__global int4 *)((__global char *)dst + dst_index_1)) = data0;
}
}
__kernel void arithm_flip_rc_C4_D5 (__global float *src, int src_step, int src_offset,
__global float *dst, int dst_step, int dst_offset,
int rows, int cols, int thread_rows, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < thread_rows)
{
int src_index_0 = mad24(y, src_step, (x << 4) + src_offset);
int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) << 4) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x << 4) + dst_offset);
int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) << 4) + dst_offset);
float4 data0 = *((__global float4 *)((__global char *)src + src_index_0));
float4 data1 = *((__global float4 *)((__global char *)src + src_index_1));
*((__global float4 *)((__global char *)dst + dst_index_0)) = data1;
*((__global float4 *)((__global char *)dst + dst_index_1)) = data0;
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_flip_rc_C4_D6 (__global double *src, int src_step, int src_offset,
__global double *dst, int dst_step, int dst_offset,
int rows, int cols, int thread_rows, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < thread_rows)
{
int src_index_0 = mad24(y, src_step, (x << 5) + src_offset);
int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) << 5) + src_offset);
int dst_index_0 = mad24(y, dst_step, (x << 5) + dst_offset);
int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) << 5) + dst_offset);
double4 data0 = *((__global double4 *)((__global char *)src + src_index_0));
double4 data1 = *((__global double4 *)((__global char *)src + src_index_1));
*((__global double4 *)((__global char *)dst + dst_index_0)) = data1;
*((__global double4 *)((__global char *)dst + dst_index_1)) = data0;
}
}
#endif

View File

@ -0,0 +1,94 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Wu Zailong, bullet@yeah.net
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if defined (DOUBLE_SUPPORT)
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#endif
#define INF_FLOAT -88.029694
#define INF_DOUBLE -709.0895657128241
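// Finite stand-ins for log(0) = -infinity: INF_FLOAT is approximately
// -127*ln(2) (the log of 2^-127, half of FLT_MIN) and INF_DOUBLE is
// -1023*ln(2) (the log of 2^-1023, half of DBL_MIN).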
//////////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////LOG/////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////////
__kernel void arithm_log_D5(int rows, int cols, int srcStep, int dstStep, int srcOffset, int dstOffset, __global float *src, __global float *dst)
{
int x = get_global_id(0);
int y = get_global_id(1);
if(x < cols && y < rows )
{
x = x << 2;
int srcIdx = mad24( y, srcStep, x + srcOffset);
int dstIdx = mad24( y, dstStep, x + dstOffset);
float src_data = *((__global float *)((__global char *)src + srcIdx));
float dst_data = (src_data == 0) ? INF_FLOAT : log(fabs(src_data));
*((__global float *)((__global char *)dst + dstIdx)) = dst_data;
}
}
__kernel void arithm_log_D6(int rows, int cols, int srcStep, int dstStep, int srcOffset, int dstOffset, __global double *src, __global double *dst)
{
int x = get_global_id(0);
int y = get_global_id(1);
if(x < cols && y < rows )
{
x = x << 3;
int srcIdx = mad24( y, srcStep, x + srcOffset);
int dstIdx = mad24( y, dstStep, x + dstOffset);
double src_data = *((__global double *)((__global char *)src + srcIdx));
double dst_data = (src_data == 0) ? INF_DOUBLE : log(fabs(src_data));
*((__global double *)((__global char *)dst + dstIdx)) = dst_data;
}
}
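// Both kernels compute dst = log(|src|) per element, mapping zero inputs to
// the finite sentinels above rather than returning -infinity.
//
// A minimal host-side launch sketch (hypothetical handles program/queue and
// buffer names, not part of this commit; steps and offsets are in bytes):
//
//     cl_int err;
//     cl_kernel k = clCreateKernel(program, "arithm_log_D5", &err);
//     clSetKernelArg(k, 0, sizeof(int), &rows);
//     clSetKernelArg(k, 1, sizeof(int), &cols);
//     clSetKernelArg(k, 2, sizeof(int), &srcStep);
//     clSetKernelArg(k, 3, sizeof(int), &dstStep);
//     clSetKernelArg(k, 4, sizeof(int), &srcOffset);
//     clSetKernelArg(k, 5, sizeof(int), &dstOffset);
//     clSetKernelArg(k, 6, sizeof(cl_mem), &srcBuf);
//     clSetKernelArg(k, 7, sizeof(cl_mem), &dstBuf);
//     size_t global[2] = { (size_t)cols, (size_t)rows };  // often rounded up
//     err = clEnqueueNDRangeKernel(queue, k, 2, NULL, global, NULL,
//                                  0, NULL, NULL);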

View File

@ -0,0 +1,96 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Jia Haipeng, jiahaipeng95@gmail.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if defined (DOUBLE_SUPPORT)
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#endif
__kernel void arithm_magnitude_D5 (__global float *src1, int src1_step, int src1_offset,
__global float *src2, int src2_step, int src2_offset,
__global float *dst, int dst_step, int dst_offset,
int rows, int cols)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
float data1 = *((__global float *)((__global char *)src1 + src1_index));
float data2 = *((__global float *)((__global char *)src2 + src2_index));
float tmp = sqrt(data1 * data1 + data2 * data2);
*((__global float *)((__global char *)dst + dst_index)) = tmp;
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_magnitude_D6 (__global double *src1, int src1_step, int src1_offset,
__global double *src2, int src2_step, int src2_offset,
__global double *dst, int dst_step, int dst_offset,
int rows, int cols)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset);
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
double data1 = *((__global double *)((__global char *)src1 + src1_index));
double data2 = *((__global double *)((__global char *)src2 + src2_index));
double tmp = sqrt(data1 * data1 + data2 * data2);
*((__global double *)((__global char *)dst + dst_index)) = tmp;
}
}
#endif
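// Note: sqrt(a*a + b*b) can overflow for inputs near FLT_MAX/DBL_MAX even
// when the true magnitude is representable. The OpenCL built-in hypot()
// computes the same quantity without undue overflow, so an alternative body
// (assuming its precision is acceptable to the callers) would be:
//
//     float tmp = hypot(data1, data2);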

View File

@ -0,0 +1,153 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Jia Haipeng, jiahaipeng95@gmail.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if defined (DOUBLE_SUPPORT)
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#endif
//////////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////magnitudeSqr//////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////////
__kernel void magnitudeSqr_C1_D5 (__global float *src1,int src1_step,int src1_offset,
__global float *src2, int src2_step,int src2_offset,
__global float *dst, int dst_step,int dst_offset,
int rows, int cols,int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#define dst_align ((dst_offset >> 2) & 3)
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 2) -(dst_align << 2));
float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index));
float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index));
float4 dst_data = *((__global float4 *)((__global char *)dst + dst_index));
float4 tmp_data ;
tmp_data.x = src1_data.x * src1_data.x + src2_data.x * src2_data.x;
tmp_data.y = src1_data.y * src1_data.y + src2_data.y * src2_data.y;
tmp_data.z = src1_data.z * src1_data.z + src2_data.z * src2_data.z;
tmp_data.w = src1_data.w * src1_data.w + src2_data.w * src2_data.w;
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.y : dst_data.y;
dst_data.z = ((dst_index + 8 >= dst_start) && (dst_index + 8 < dst_end)) ? tmp_data.z : dst_data.z;
dst_data.w = ((dst_index + 12 >= dst_start) && (dst_index + 12 < dst_end)) ? tmp_data.w : dst_data.w;
*((__global float4 *)((__global char *)dst + dst_index)) = dst_data;
}
}
__kernel void magnitudeSqr_C2_D5 (__global float *src1,int src1_step,int src1_offset,
__global float *dst, int dst_step,int dst_offset,
int rows, int cols,int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#define dst_align ((dst_offset >> 2) & 3)
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 2) -(dst_align << 2));
float8 src1_data = vload8(0, (__global float *)((__global char *)src1 + src1_index));
float4 dst_data = *((__global float4 *)((__global char *)dst + dst_index));
float4 tmp_data ;
tmp_data.x = src1_data.s0 * src1_data.s0 + src1_data.s1 * src1_data.s1;
tmp_data.y = src1_data.s2 * src1_data.s2 + src1_data.s3 * src1_data.s3;
tmp_data.z = src1_data.s4 * src1_data.s4 + src1_data.s5 * src1_data.s5;
tmp_data.w = src1_data.s6 * src1_data.s6 + src1_data.s7 * src1_data.s7;
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.y : dst_data.y;
dst_data.z = ((dst_index + 8 >= dst_start) && (dst_index + 8 < dst_end)) ? tmp_data.z : dst_data.z;
dst_data.w = ((dst_index + 12 >= dst_start) && (dst_index + 12 < dst_end)) ? tmp_data.w : dst_data.w;
*((__global float4 *)((__global char *)dst + dst_index)) = dst_data;
}
}
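// Alignment sketch for the kernels above: a dst row need not start on a
// 16-byte boundary, so each work item rebases its float4 store to the
// aligned address dst_offset + (x << 2) - (dst_align << 2) and relies on the
// per-lane [dst_start, dst_end) range checks to avoid touching bytes outside
// the row. For example, with dst_offset = 8 the macro gives dst_align =
// (8 >> 2) & 3 = 2, so the first item of a row writes only lanes .z and .w;
// lanes .x and .y fail the dst_index + 0/4 >= dst_start test and keep the
// destination's original contents.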

View File

@ -0,0 +1,218 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Shengen Yan, yanshengen@gmail.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
/**************************************PUBLICFUNC*************************************/
#if defined (DOUBLE_SUPPORT)
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#endif
#if defined (DEPTH_0)
#define VEC_TYPE uchar8
#define CONVERT_TYPE convert_uchar8
#define MIN_VAL 0
#define MAX_VAL 255
#endif
#if defined (DEPTH_1)
#define VEC_TYPE char8
#define CONVERT_TYPE convert_char8
#define MIN_VAL -128
#define MAX_VAL 127
#endif
#if defined (DEPTH_2)
#define VEC_TYPE ushort8
#define CONVERT_TYPE convert_ushort8
#define MIN_VAL 0
#define MAX_VAL 65535
#endif
#if defined (DEPTH_3)
#define VEC_TYPE short8
#define CONVERT_TYPE convert_short8
#define MIN_VAL -32768
#define MAX_VAL 32767
#endif
#if defined (DEPTH_4)
#define VEC_TYPE int8
#define CONVERT_TYPE convert_int8
#define MIN_VAL INT_MIN
#define MAX_VAL INT_MAX
#endif
#if defined (DEPTH_5)
#define VEC_TYPE float8
#define CONVERT_TYPE convert_float8
#define MIN_VAL (-FLT_MAX)
#define MAX_VAL FLT_MAX
#endif
#if defined (DEPTH_6)
#define VEC_TYPE double8
#define CONVERT_TYPE convert_double8
#define MIN_VAL (-DBL_MAX)
#define MAX_VAL DBL_MAX
#endif
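// These DEPTH_n switches presumably mirror OpenCV's depth codes (CV_8U = 0,
// CV_8S = 1, CV_16U = 2, CV_16S = 3, CV_32S = 4, CV_32F = 5, CV_64F = 6);
// the host side is assumed to compile this file with exactly one -D DEPTH_n
// plus one REPEAT_Sn and one REPEAT_En matching the ROI's vector padding.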
#if defined (REPEAT_S0)
#define repeat_s(a) a = a;
#endif
#if defined (REPEAT_S1)
#define repeat_s(a) a.s0 = a.s1;
#endif
#if defined (REPEAT_S2)
#define repeat_s(a) a.s0 = a.s2;a.s1 = a.s2;
#endif
#if defined (REPEAT_S3)
#define repeat_s(a) a.s0 = a.s3;a.s1 = a.s3;a.s2 = a.s3;
#endif
#if defined (REPEAT_S4)
#define repeat_s(a) a.s0 = a.s4;a.s1 = a.s4;a.s2 = a.s4;a.s3 = a.s4;
#endif
#if defined (REPEAT_S5)
#define repeat_s(a) a.s0 = a.s5;a.s1 = a.s5;a.s2 = a.s5;a.s3 = a.s5;a.s4 = a.s5;
#endif
#if defined (REPEAT_S6)
#define repeat_s(a) a.s0 = a.s6;a.s1 = a.s6;a.s2 = a.s6;a.s3 = a.s6;a.s4 = a.s6;a.s5 = a.s6;
#endif
#if defined (REPEAT_S7)
#define repeat_s(a) a.s0 = a.s7;a.s1 = a.s7;a.s2 = a.s7;a.s3 = a.s7;a.s4 = a.s7;a.s5 = a.s7;a.s6 = a.s7;
#endif
#if defined (REPEAT_E0)
#define repeat_e(a) a = a;
#endif
#if defined (REPEAT_E1)
#define repeat_e(a) a.s7 = a.s6;
#endif
#if defined (REPEAT_E2)
#define repeat_e(a) a.s7 = a.s5;a.s6 = a.s5;
#endif
#if defined (REPEAT_E3)
#define repeat_e(a) a.s7 = a.s4;a.s6 = a.s4;a.s5 = a.s4;
#endif
#if defined (REPEAT_E4)
#define repeat_e(a) a.s7 = a.s3;a.s6 = a.s3;a.s5 = a.s3;a.s4 = a.s3;
#endif
#if defined (REPEAT_E5)
#define repeat_e(a) a.s7 = a.s2;a.s6 = a.s2;a.s5 = a.s2;a.s4 = a.s2;a.s3 = a.s2;
#endif
#if defined (REPEAT_E6)
#define repeat_e(a) a.s7 = a.s1;a.s6 = a.s1;a.s5 = a.s1;a.s4 = a.s1;a.s3 = a.s1;a.s2 = a.s1;
#endif
#if defined (REPEAT_E7)
#define repeat_e(a) a.s7 = a.s0;a.s6 = a.s0;a.s5 = a.s0;a.s4 = a.s0;a.s3 = a.s0;a.s2 = a.s0;a.s1 = a.s0;
#endif
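// Boundary sketch for repeat_s/repeat_e: each work item reduces an 8-lane
// vector, so when a row's data does not fill the first or last vector, the
// stale lanes must not win the reduction. REPEAT_Sn overwrites the first n
// lanes with the first valid lane and REPEAT_En the last n lanes with the
// last valid one. E.g. under REPEAT_S2 a first-load vector
// (9, 9, 3, 7, ...) becomes (3, 3, 3, 7, ...), so the two invalid leading
// 9s can no longer distort the minimum or maximum.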
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics:enable
#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics:enable
/**************************************Array minMax**************************************/
__kernel void arithm_op_minMax (int cols,int invalid_cols,int offset,int elemnum,int groupnum,
__global VEC_TYPE *src, __global VEC_TYPE *dst)
{
unsigned int lid = get_local_id(0);
unsigned int gid = get_group_id(0);
unsigned int id = get_global_id(0);
unsigned int idx = offset + id + (id / cols) * invalid_cols;
__local VEC_TYPE localmem_max[128],localmem_min[128];
VEC_TYPE minval,maxval,temp;
if(id < elemnum)
{
temp = src[idx];
if(id % cols == 0 )
{
repeat_s(temp);
}
if(id % cols == cols - 1)
{
repeat_e(temp);
}
minval = temp;
maxval = temp;
}
else
{
minval = MAX_VAL;
maxval = MIN_VAL;
}
for(id=id + (groupnum << 8); id < elemnum;id = id + (groupnum << 8))
{
idx = offset + id + (id / cols) * invalid_cols;
temp = src[idx];
if(id % cols == 0 )
{
repeat_s(temp);
}
if(id % cols == cols - 1)
{
repeat_e(temp);
}
minval = min(minval,temp);
maxval = max(maxval,temp);
}
if(lid > 127)
{
localmem_min[lid - 128] = minval;
localmem_max[lid - 128] = maxval;
}
barrier(CLK_LOCAL_MEM_FENCE);
if(lid < 128)
{
localmem_min[lid] = min(minval,localmem_min[lid]);
localmem_max[lid] = max(maxval,localmem_max[lid]);
}
barrier(CLK_LOCAL_MEM_FENCE);
for(int lsize = 64; lsize > 0; lsize >>= 1)
{
if(lid < lsize)
{
int lid2 = lsize + lid;
localmem_min[lid] = min(localmem_min[lid] , localmem_min[lid2]);
localmem_max[lid] = max(localmem_max[lid] , localmem_max[lid2]);
}
barrier(CLK_LOCAL_MEM_FENCE);
}
if( lid == 0)
{
dst[gid] = localmem_min[0];
dst[gid + groupnum] = localmem_max[0];
}
}
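// Reduction sketch (assuming work-groups of 256, which the lid > 127 /
// lid < 128 split and the groupnum << 8 grid stride imply): each item folds
// a grid-stride slice into a private 8-lane min/max, the upper 128 lanes of
// the group spill to local memory, the lower 128 merge into them, and a
// 7-step tree leaves the group result in element 0. The host is then
// expected to finish the job, e.g.:
//
//     for (g = 0; g < groupnum; g++)      // hypothetical host epilogue
//         for (l = 0; l < 8; l++) {
//             minv = min(minv, dst_host[g][l]);
//             maxv = max(maxv, dst_host[groupnum + g][l]);
//         }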

View File

@ -0,0 +1,423 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Shengen Yan, yanshengen@gmail.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
/**************************************PUBLICFUNC*************************************/
#if defined (DOUBLE_SUPPORT)
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#define RES_TYPE double8
#define CONVERT_RES_TYPE convert_double8
#else
#define RES_TYPE float8
#define CONVERT_RES_TYPE convert_float8
#endif
#if defined (DEPTH_0)
#define VEC_TYPE uchar8
#define VEC_TYPE_LOC int8
#define CONVERT_TYPE convert_uchar8
#define CONDITION_FUNC(a,b,c) (convert_int8(a) ? b : c)
#define MIN_VAL 0
#define MAX_VAL 255
#endif
#if defined (DEPTH_1)
#define VEC_TYPE char8
#define VEC_TYPE_LOC int8
#define CONVERT_TYPE convert_char8
#define CONDITION_FUNC(a,b,c) (convert_int8(a) ? b : c)
#define MIN_VAL -128
#define MAX_VAL 127
#endif
#if defined (DEPTH_2)
#define VEC_TYPE ushort8
#define VEC_TYPE_LOC int8
#define CONVERT_TYPE convert_ushort8
#define CONDITION_FUNC(a,b,c) (convert_int8(a) ? b : c)
#define MIN_VAL 0
#define MAX_VAL 65535
#endif
#if defined (DEPTH_3)
#define VEC_TYPE short8
#define VEC_TYPE_LOC int8
#define CONVERT_TYPE convert_short8
#define CONDITION_FUNC(a,b,c) (convert_int8(a) ? b : c)
#define MIN_VAL -32768
#define MAX_VAL 32767
#endif
#if defined (DEPTH_4)
#define VEC_TYPE int8
#define VEC_TYPE_LOC int8
#define CONVERT_TYPE convert_int8
#define CONDITION_FUNC(a,b,c) ((a) ? b : c)
#define MIN_VAL INT_MIN
#define MAX_VAL INT_MAX
#endif
#if defined (DEPTH_5)
#define VEC_TYPE float8
#define VEC_TYPE_LOC float8
#define CONVERT_TYPE convert_float8
#define CONDITION_FUNC(a,b,c) ((a) ? b : c)
#define MIN_VAL (-FLT_MAX)
#define MAX_VAL FLT_MAX
#endif
#if defined (DEPTH_6)
#define VEC_TYPE double8
#define VEC_TYPE_LOC double8
#define CONVERT_TYPE convert_double8
#define CONDITION_FUNC(a,b,c) ((a) ? b : c)
#define MIN_VAL (-DBL_MAX)
#define MAX_VAL DBL_MAX
#endif
#if defined (REPEAT_S0)
#define repeat_s(a) a=a;
#endif
#if defined (REPEAT_S1)
#define repeat_s(a) a.s0 = a.s1;
#endif
#if defined (REPEAT_S2)
#define repeat_s(a) a.s0 = a.s2;a.s1 = a.s2;
#endif
#if defined (REPEAT_S3)
#define repeat_s(a) a.s0 = a.s3;a.s1 = a.s3;a.s2 = a.s3;
#endif
#if defined (REPEAT_S4)
#define repeat_s(a) a.s0 = a.s4;a.s1 = a.s4;a.s2 = a.s4;a.s3 = a.s4;
#endif
#if defined (REPEAT_S5)
#define repeat_s(a) a.s0 = a.s5;a.s1 = a.s5;a.s2 = a.s5;a.s3 = a.s5;a.s4 = a.s5;
#endif
#if defined (REPEAT_S6)
#define repeat_s(a) a.s0 = a.s6;a.s1 = a.s6;a.s2 = a.s6;a.s3 = a.s6;a.s4 = a.s6;a.s5 = a.s6;
#endif
#if defined (REPEAT_S7)
#define repeat_s(a) a.s0 = a.s7;a.s1 = a.s7;a.s2 = a.s7;a.s3 = a.s7;a.s4 = a.s7;a.s5 = a.s7;a.s6 = a.s7;
#endif
#if defined (REPEAT_E0)
#define repeat_e(a) a=a;
#endif
#if defined (REPEAT_E1)
#define repeat_e(a) a.s7 = a.s6;
#endif
#if defined (REPEAT_E2)
#define repeat_e(a) a.s7 = a.s5;a.s6 = a.s5;
#endif
#if defined (REPEAT_E3)
#define repeat_e(a) a.s7 = a.s4;a.s6 = a.s4;a.s5 = a.s4;
#endif
#if defined (REPEAT_E4)
#define repeat_e(a) a.s7 = a.s3;a.s6 = a.s3;a.s5 = a.s3;a.s4 = a.s3;
#endif
#if defined (REPEAT_E5)
#define repeat_e(a) a.s7 = a.s2;a.s6 = a.s2;a.s5 = a.s2;a.s4 = a.s2;a.s3 = a.s2;
#endif
#if defined (REPEAT_E6)
#define repeat_e(a) a.s7 = a.s1;a.s6 = a.s1;a.s5 = a.s1;a.s4 = a.s1;a.s3 = a.s1;a.s2 = a.s1;
#endif
#if defined (REPEAT_E7)
#define repeat_e(a) a.s7 = a.s0;a.s6 = a.s0;a.s5 = a.s0;a.s4 = a.s0;a.s3 = a.s0;a.s2 = a.s0;a.s1 = a.s0;
#endif
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics:enable
#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics:enable
/**************************************Array minMaxLoc**************************************/
__kernel void arithm_op_minMaxLoc (int cols,int invalid_cols,int offset,int elemnum,int groupnum,
__global VEC_TYPE *src, __global RES_TYPE *dst)
{
unsigned int lid = get_local_id(0);
unsigned int gid = get_group_id(0);
unsigned int id = get_global_id(0);
unsigned int idx = offset + id + (id / cols) * invalid_cols;
__local VEC_TYPE localmem_max[128],localmem_min[128];
VEC_TYPE minval,maxval,temp;
__local VEC_TYPE_LOC localmem_maxloc[128],localmem_minloc[128];
VEC_TYPE_LOC minloc,maxloc,temploc,negative = -1;
int idx_c;
if(id < elemnum)
{
temp = src[idx];
idx_c = idx << 3;
temploc = (VEC_TYPE_LOC)(idx_c,idx_c+1,idx_c+2,idx_c+3,idx_c+4,idx_c+5,idx_c+6,idx_c+7);
if(id % cols == 0 )
{
repeat_s(temp);
repeat_s(temploc);
}
if(id % cols == cols - 1)
{
repeat_e(temp);
repeat_e(temploc);
}
minval = temp;
maxval = temp;
minloc = temploc;
maxloc = temploc;
}
else
{
minval = MAX_VAL;
maxval = MIN_VAL;
minloc = negative;
maxloc = negative;
}
for(id=id + (groupnum << 8); id < elemnum;id = id + (groupnum << 8))
{
idx = offset + id + (id / cols) * invalid_cols;
temp = src[idx];
idx_c = idx << 3;
temploc = (VEC_TYPE_LOC)(idx_c,idx_c+1,idx_c+2,idx_c+3,idx_c+4,idx_c+5,idx_c+6,idx_c+7);
if(id % cols == 0 )
{
repeat_s(temp);
repeat_s(temploc);
}
if(id % cols == cols - 1)
{
repeat_e(temp);
repeat_e(temploc);
}
minval = min(minval,temp);
maxval = max(maxval,temp);
minloc = CONDITION_FUNC(minval == temp, temploc , minloc);
maxloc = CONDITION_FUNC(maxval == temp, temploc , maxloc);
}
if(lid > 127)
{
localmem_min[lid - 128] = minval;
localmem_max[lid - 128] = maxval;
localmem_minloc[lid - 128] = minloc;
localmem_maxloc[lid - 128] = maxloc;
}
barrier(CLK_LOCAL_MEM_FENCE);
if(lid < 128)
{
localmem_min[lid] = min(minval,localmem_min[lid]);
localmem_max[lid] = max(maxval,localmem_max[lid]);
localmem_minloc[lid] = CONDITION_FUNC(localmem_min[lid] == minval, minloc , localmem_minloc[lid]);
localmem_maxloc[lid] = CONDITION_FUNC(localmem_max[lid] == maxval, maxloc , localmem_maxloc[lid]);
}
barrier(CLK_LOCAL_MEM_FENCE);
for(int lsize = 64; lsize > 0; lsize >>= 1)
{
if(lid < lsize)
{
int lid2 = lsize + lid;
localmem_min[lid] = min(localmem_min[lid] , localmem_min[lid2]);
localmem_max[lid] = max(localmem_max[lid] , localmem_max[lid2]);
localmem_minloc[lid] =
CONDITION_FUNC(localmem_min[lid] == localmem_min[lid2], localmem_minloc[lid2] , localmem_minloc[lid]);
localmem_maxloc[lid] =
CONDITION_FUNC(localmem_max[lid] == localmem_max[lid2], localmem_maxloc[lid2] , localmem_maxloc[lid]);
}
barrier(CLK_LOCAL_MEM_FENCE);
}
if( lid == 0)
{
dst[gid] = CONVERT_RES_TYPE(localmem_min[0]);
dst[gid + groupnum] = CONVERT_RES_TYPE(localmem_max[0]);
dst[gid + 2 * groupnum] = CONVERT_RES_TYPE(localmem_minloc[0]);
dst[gid + 3 * groupnum] = CONVERT_RES_TYPE(localmem_maxloc[0]);
}
}
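// Output layout sketch: dst holds 4 * groupnum vectors, namely the per-group
// minima, maxima, min locations and max locations, in that order. A location
// is the flattened element index (idx << 3) + lane within the padded row
// layout, not an (x, y) pair; converting back is assumed to be host-side
// work, roughly x = loc % row_width_in_elements and y = loc / row_width_in_elements.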
#if defined (REPEAT_S0)
#define repeat_ms(a) a = a;
#endif
#if defined (REPEAT_S1)
#define repeat_ms(a) a.s0 = 0;
#endif
#if defined (REPEAT_S2)
#define repeat_ms(a) a.s0 = 0;a.s1 = 0;
#endif
#if defined (REPEAT_S3)
#define repeat_ms(a) a.s0 = 0;a.s1 = 0;a.s2 = 0;
#endif
#if defined (REPEAT_S4)
#define repeat_ms(a) a.s0 = 0;a.s1 = 0;a.s2 = 0;a.s3 = 0;
#endif
#if defined (REPEAT_S5)
#define repeat_ms(a) a.s0 = 0;a.s1 = 0;a.s2 = 0;a.s3 = 0;a.s4 = 0;
#endif
#if defined (REPEAT_S6)
#define repeat_ms(a) a.s0 = 0;a.s1 = 0;a.s2 = 0;a.s3 = 0;a.s4 = 0;a.s5 = 0;
#endif
#if defined (REPEAT_S7)
#define repeat_ms(a) a.s0 = 0;a.s1 = 0;a.s2 = 0;a.s3 = 0;a.s4 = 0;a.s5 = 0;a.s6 = 0;
#endif
#if defined (REPEAT_E0)
#define repeat_me(a) a = a;
#endif
#if defined (REPEAT_E1)
#define repeat_me(a) a.s7 = 0;
#endif
#if defined (REPEAT_E2)
#define repeat_me(a) a.s7 = 0;a.s6 = 0;
#endif
#if defined (REPEAT_E3)
#define repeat_me(a) a.s7 = 0;a.s6 = 0;a.s5 = 0;
#endif
#if defined (REPEAT_E4)
#define repeat_me(a) a.s7 = 0;a.s6 = 0;a.s5 = 0;a.s4 = 0;
#endif
#if defined (REPEAT_E5)
#define repeat_me(a) a.s7 = 0;a.s6 = 0;a.s5 = 0;a.s4 = 0;a.s3 = 0;
#endif
#if defined (REPEAT_E6)
#define repeat_me(a) a.s7 = 0;a.s6 = 0;a.s5 = 0;a.s4 = 0;a.s3 = 0;a.s2 = 0;
#endif
#if defined (REPEAT_E7)
#define repeat_me(a) a.s7 = 0;a.s6 = 0;a.s5 = 0;a.s4 = 0;a.s3 = 0;a.s2 = 0;a.s1 = 0;
#endif
/**************************************Array minMaxLoc mask**************************************/
__kernel void arithm_op_minMaxLoc_mask (int cols,int invalid_cols,int offset,int elemnum,int groupnum,__global VEC_TYPE *src,
int minvalid_cols,int moffset,__global uchar8 *mask,__global RES_TYPE *dst)
{
unsigned int lid = get_local_id(0);
unsigned int gid = get_group_id(0);
unsigned int id = get_global_id(0);
unsigned int idx = offset + id + (id / cols) * invalid_cols;
unsigned int midx = moffset + id + (id / cols) * minvalid_cols;
__local VEC_TYPE localmem_max[128],localmem_min[128];
VEC_TYPE minval,maxval,temp,max_val = MAX_VAL,min_val = MIN_VAL,zero = 0,m_temp;
__local VEC_TYPE_LOC localmem_maxloc[128],localmem_minloc[128];
VEC_TYPE_LOC minloc,maxloc,temploc,negative = -1;
if(id < elemnum)
{
temp = src[idx];
m_temp = CONVERT_TYPE(mask[midx]);
int idx_c = idx << 3;
temploc = (VEC_TYPE_LOC)(idx_c,idx_c+1,idx_c+2,idx_c+3,idx_c+4,idx_c+5,idx_c+6,idx_c+7);
if(id % cols == 0 )
{
repeat_ms(m_temp);
repeat_s(temploc);
}
if(id % cols == cols - 1)
{
repeat_me(m_temp);
repeat_e(temploc);
}
minval = m_temp != zero ? temp : max_val; // != rather than >: a mask value of 255 wraps to -1 for signed char depths
maxval = m_temp != zero ? temp : min_val;
minloc = CONDITION_FUNC(m_temp != zero, temploc , negative);
maxloc = minloc;
}
else
{
minval = MAX_VAL;
maxval = MIN_VAL;
minloc = negative;
maxloc = negative;
}
for(id=id + (groupnum << 8); id < elemnum;id = id + (groupnum << 8))
{
idx = offset + id + (id / cols) * invalid_cols;
midx = moffset + id + (id / cols) * minvalid_cols;
temp = src[idx];
m_temp = CONVERT_TYPE(mask[midx]);
int idx_c = idx << 3;
temploc = (VEC_TYPE_LOC)(idx_c,idx_c+1,idx_c+2,idx_c+3,idx_c+4,idx_c+5,idx_c+6,idx_c+7);
if(id % cols == 0 )
{
repeat_ms(m_temp);
repeat_s(temploc);
}
if(id % cols == cols - 1)
{
repeat_me(m_temp);
repeat_e(temploc);
}
minval = min(minval,m_temp != zero ? temp : max_val);
maxval = max(maxval,m_temp != zero ? temp : min_val);
temploc = CONDITION_FUNC(m_temp != zero, temploc , negative);
minloc = CONDITION_FUNC(minval == temp, temploc , minloc);
maxloc = CONDITION_FUNC(maxval == temp, temploc , maxloc);
}
if(lid > 127)
{
localmem_min[lid - 128] = minval;
localmem_max[lid - 128] = maxval;
localmem_minloc[lid - 128] = minloc;
localmem_maxloc[lid - 128] = maxloc;
}
barrier(CLK_LOCAL_MEM_FENCE);
if(lid < 128)
{
localmem_min[lid] = min(minval,localmem_min[lid]);
localmem_max[lid] = max(maxval,localmem_max[lid]);
localmem_minloc[lid] = CONDITION_FUNC(localmem_min[lid] == minval, minloc , localmem_minloc[lid]);
localmem_maxloc[lid] = CONDITION_FUNC(localmem_max[lid] == maxval, maxloc , localmem_maxloc[lid]);
}
barrier(CLK_LOCAL_MEM_FENCE);
for(int lsize = 64; lsize > 0; lsize >>= 1)
{
if(lid < lsize)
{
int lid2 = lsize + lid;
localmem_min[lid] = min(localmem_min[lid] , localmem_min[lid2]);
localmem_max[lid] = max(localmem_max[lid] , localmem_max[lid2]);
localmem_minloc[lid] =
CONDITION_FUNC(localmem_min[lid] == localmem_min[lid2], localmem_minloc[lid2] , localmem_minloc[lid]);
localmem_maxloc[lid] =
CONDITION_FUNC(localmem_max[lid] == localmem_max[lid2], localmem_maxloc[lid2] , localmem_maxloc[lid]);
}
barrier(CLK_LOCAL_MEM_FENCE);
}
if( lid == 0)
{
dst[gid] = CONVERT_RES_TYPE(localmem_min[0]);
dst[gid + groupnum] = CONVERT_RES_TYPE(localmem_max[0]);
dst[gid + 2 * groupnum] = CONVERT_RES_TYPE(localmem_minloc[0]);
dst[gid + 3 * groupnum] = CONVERT_RES_TYPE(localmem_maxloc[0]);
}
}

View File

@ -0,0 +1,267 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Shengen Yan, yanshengen@gmail.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
/**************************************PUBLICFUNC*************************************/
#if defined (DOUBLE_SUPPORT)
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#define RES_TYPE double8
#define CONVERT_RES_TYPE convert_double8
#else
#define RES_TYPE float8
#define CONVERT_RES_TYPE convert_float8
#endif
#if defined (DEPTH_0)
#define TYPE uchar
#define VEC_TYPE uchar8
#define VEC_TYPE_LOC int8
#define CONVERT_TYPE convert_uchar8
#define CONDITION_FUNC(a,b,c) (convert_int8(a) ? b : c)
#define MIN_VAL 0
#define MAX_VAL 255
#endif
#if defined (DEPTH_1)
#define TYPE char
#define VEC_TYPE char8
#define VEC_TYPE_LOC int8
#define CONVERT_TYPE convert_char8
#define CONDITION_FUNC(a,b,c) (convert_int8(a) ? b : c)
#define MIN_VAL -128
#define MAX_VAL 127
#endif
#if defined (DEPTH_2)
#define TYPE ushort
#define VEC_TYPE ushort8
#define VEC_TYPE_LOC int8
#define CONVERT_TYPE convert_ushort8
#define CONDITION_FUNC(a,b,c) (convert_int8(a) ? b : c)
#define MIN_VAL 0
#define MAX_VAL 65535
#endif
#if defined (DEPTH_3)
#define TYPE short
#define VEC_TYPE short8
#define VEC_TYPE_LOC int8
#define CONVERT_TYPE convert_short8
#define CONDITION_FUNC(a,b,c) (convert_int8(a) ? b : c)
#define MIN_VAL -32768
#define MAX_VAL 32767
#endif
#if defined (DEPTH_4)
#define TYPE int
#define VEC_TYPE int8
#define VEC_TYPE_LOC int8
#define CONVERT_TYPE convert_int8
#define CONDITION_FUNC(a,b,c) ((a) ? b : c)
#define MIN_VAL INT_MIN
#define MAX_VAL INT_MAX
#endif
#if defined (DEPTH_5)
#define TYPE float
#define VEC_TYPE float8
#define VEC_TYPE_LOC float8
#define CONVERT_TYPE convert_float8
#define CONDITION_FUNC(a,b,c) ((a) ? b : c)
#define MIN_VAL (-FLT_MAX)
#define MAX_VAL FLT_MAX
#endif
#if defined (DEPTH_6)
#define TYPE double
#define VEC_TYPE double8
#define VEC_TYPE_LOC double8
#define CONVERT_TYPE convert_double8
#define CONDITION_FUNC(a,b,c) ((a) ? b : c)
#define MIN_VAL (-DBL_MAX)
#define MAX_VAL DBL_MAX
#endif
#if defined (REPEAT_E0)
#define repeat_e(a) a = a;
#endif
#if defined (REPEAT_E1)
#define repeat_e(a) a.s7 = a.s6;
#endif
#if defined (REPEAT_E2)
#define repeat_e(a) a.s7 = a.s5;a.s6 = a.s5;
#endif
#if defined (REPEAT_E3)
#define repeat_e(a) a.s7 = a.s4;a.s6 = a.s4;a.s5 = a.s4;
#endif
#if defined (REPEAT_E4)
#define repeat_e(a) a.s7 = a.s3;a.s6 = a.s3;a.s5 = a.s3;a.s4 = a.s3;
#endif
#if defined (REPEAT_E5)
#define repeat_e(a) a.s7 = a.s2;a.s6 = a.s2;a.s5 = a.s2;a.s4 = a.s2;a.s3 = a.s2;
#endif
#if defined (REPEAT_E6)
#define repeat_e(a) a.s7 = a.s1;a.s6 = a.s1;a.s5 = a.s1;a.s4 = a.s1;a.s3 = a.s1;a.s2 = a.s1;
#endif
#if defined (REPEAT_E7)
#define repeat_e(a) a.s7 = a.s0;a.s6 = a.s0;a.s5 = a.s0;a.s4 = a.s0;a.s3 = a.s0;a.s2 = a.s0;a.s1 = a.s0;
#endif
#if defined (REPEAT_E0)
#define repeat_me(a) a = a;
#endif
#if defined (REPEAT_E1)
#define repeat_me(a) a.s7 = 0;
#endif
#if defined (REPEAT_E2)
#define repeat_me(a) a.s7 = 0;a.s6 = 0;
#endif
#if defined (REPEAT_E3)
#define repeat_me(a) a.s7 = 0;a.s6 = 0;a.s5 = 0;
#endif
#if defined (REPEAT_E4)
#define repeat_me(a) a.s7 = 0;a.s6 = 0;a.s5 = 0;a.s4 = 0;
#endif
#if defined (REPEAT_E5)
#define repeat_me(a) a.s7 = 0;a.s6 = 0;a.s5 = 0;a.s4 = 0;a.s3 = 0;
#endif
#if defined (REPEAT_E6)
#define repeat_me(a) a.s7 = 0;a.s6 = 0;a.s5 = 0;a.s4 = 0;a.s3 = 0;a.s2 = 0;
#endif
#if defined (REPEAT_E7)
#define repeat_me(a) a.s7 = 0;a.s6 = 0;a.s5 = 0;a.s4 = 0;a.s3 = 0;a.s2 = 0;a.s1 = 0;
#endif
/**************************************Array minMaxLoc mask**************************************/
__kernel void arithm_op_minMaxLoc_mask (int cols,int invalid_cols,int offset,int elemnum,int groupnum,__global TYPE *src,
int minvalid_cols,int moffset,__global uchar *mask,__global RES_TYPE *dst)
{
unsigned int lid = get_local_id(0);
unsigned int gid = get_group_id(0);
unsigned int id = get_global_id(0);
unsigned int idx = id + (id / cols) * invalid_cols;
unsigned int midx = id + (id / cols) * minvalid_cols;
__local VEC_TYPE lm_max[128],lm_min[128];
VEC_TYPE minval,maxval,temp,m_temp;
__local VEC_TYPE_LOC lm_maxloc[128],lm_minloc[128];
VEC_TYPE_LOC minloc,maxloc,temploc,negative = -1,one = 1,zero = 0;
if(id < elemnum)
{
temp = vload8(idx, &src[offset]);
m_temp = CONVERT_TYPE(vload8(midx,&mask[moffset]));
int idx_c = (idx << 3) + offset;
temploc = (VEC_TYPE_LOC)(idx_c,idx_c+1,idx_c+2,idx_c+3,idx_c+4,idx_c+5,idx_c+6,idx_c+7);
if(id % cols == cols - 1)
{
repeat_me(m_temp);
repeat_e(temploc);
}
minval = m_temp != (VEC_TYPE)0 ? temp : (VEC_TYPE)MAX_VAL;
maxval = m_temp != (VEC_TYPE)0 ? temp : (VEC_TYPE)MIN_VAL;
minloc = CONDITION_FUNC(m_temp != (VEC_TYPE)0, temploc , negative);
maxloc = minloc;
}
else
{
minval = MAX_VAL;
maxval = MIN_VAL;
minloc = negative;
maxloc = negative;
}
for(id=id + (groupnum << 8); id < elemnum;id = id + (groupnum << 8))
{
idx = id + (id / cols) * invalid_cols;
midx = id + (id / cols) * minvalid_cols;
temp = vload8(idx, &src[offset]);
m_temp = CONVERT_TYPE(vload8(midx,&mask[moffset]));
int idx_c = (idx << 3) + offset;
temploc = (VEC_TYPE_LOC)(idx_c,idx_c+1,idx_c+2,idx_c+3,idx_c+4,idx_c+5,idx_c+6,idx_c+7);
if(id % cols == cols - 1)
{
repeat_me(m_temp);
repeat_e(temploc);
}
minval = min(minval,m_temp != (VEC_TYPE)0 ? temp : minval);
maxval = max(maxval,m_temp != (VEC_TYPE)0 ? temp : maxval);
minloc = CONDITION_FUNC((minval == temp) && (m_temp != (VEC_TYPE)0), temploc , minloc);
maxloc = CONDITION_FUNC((maxval == temp) && (m_temp != (VEC_TYPE)0), temploc , maxloc);
}
if(lid > 127)
{
lm_min[lid - 128] = minval;
lm_max[lid - 128] = maxval;
lm_minloc[lid - 128] = minloc;
lm_maxloc[lid - 128] = maxloc;
}
barrier(CLK_LOCAL_MEM_FENCE);
if(lid < 128)
{
lm_min[lid] = min(minval,lm_min[lid]);
lm_max[lid] = max(maxval,lm_max[lid]);
VEC_TYPE con_min = CONVERT_TYPE(minloc != negative ? one : zero);
VEC_TYPE con_max = CONVERT_TYPE(maxloc != negative ? one : zero);
lm_minloc[lid] = CONDITION_FUNC((lm_min[lid] == minval) && (con_min != (VEC_TYPE)0), minloc , lm_minloc[lid]);
lm_maxloc[lid] = CONDITION_FUNC((lm_max[lid] == maxval) && (con_max != (VEC_TYPE)0), maxloc , lm_maxloc[lid]);
}
barrier(CLK_LOCAL_MEM_FENCE);
for(int lsize = 64; lsize > 0; lsize >>= 1)
{
if(lid < lsize)
{
int lid2 = lsize + lid;
lm_min[lid] = min(lm_min[lid] , lm_min[lid2]);
lm_max[lid] = max(lm_max[lid] , lm_max[lid2]);
VEC_TYPE con_min = CONVERT_TYPE(lm_minloc[lid2] != negative ? one : zero);
VEC_TYPE con_max = CONVERT_TYPE(lm_maxloc[lid2] != negative ? one : zero);
lm_minloc[lid] =
CONDITION_FUNC((lm_min[lid] == lm_min[lid2]) && (con_min != (VEC_TYPE)0), lm_minloc[lid2] , lm_minloc[lid]);
lm_maxloc[lid] =
CONDITION_FUNC((lm_max[lid] == lm_max[lid2]) && (con_max != (VEC_TYPE)0), lm_maxloc[lid2] , lm_maxloc[lid]);
}
barrier(CLK_LOCAL_MEM_FENCE);
}
if( lid == 0)
{
dst[gid] = CONVERT_RES_TYPE(lm_min[0]);
dst[gid + groupnum] = CONVERT_RES_TYPE(lm_max[0]);
dst[gid + 2 * groupnum] = CONVERT_RES_TYPE(lm_minloc[0]);
dst[gid + 3 * groupnum] = CONVERT_RES_TYPE(lm_maxloc[0]);
}
}
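// Unlike the uchar8*-based variant, this kernel reads through scalar
// pointers with vload8(), which only requires alignment to the scalar type,
// so rows whose byte offset is not a multiple of the vector width are still
// handled. The con_min/con_max tests against the negative sentinel keep a
// lane that never saw an unmasked element from stealing the winning index
// during the local-memory merge.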

View File

@ -0,0 +1,197 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Shengen Yan, yanshengen@gmail.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
/**************************************PUBLICFUNC*************************************/
#if defined (DOUBLE_SUPPORT)
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#endif
#if defined (DEPTH_0)
#define VEC_TYPE uchar8
#define TYPE uchar
#define CONVERT_TYPE convert_uchar8
#define MIN_VAL 0
#define MAX_VAL 255
#endif
#if defined (DEPTH_1)
#define VEC_TYPE char8
#define TYPE char
#define CONVERT_TYPE convert_char8
#define MIN_VAL -128
#define MAX_VAL 127
#endif
#if defined (DEPTH_2)
#define VEC_TYPE ushort8
#define TYPE ushort
#define CONVERT_TYPE convert_ushort8
#define MIN_VAL 0
#define MAX_VAL 65535
#endif
#if defined (DEPTH_3)
#define VEC_TYPE short8
#define TYPE short
#define CONVERT_TYPE convert_short8
#define MIN_VAL -32768
#define MAX_VAL 32767
#endif
#if defined (DEPTH_4)
#define VEC_TYPE int8
#define TYPE int
#define CONVERT_TYPE convert_int8
#define MIN_VAL INT_MIN
#define MAX_VAL INT_MAX
#endif
#if defined (DEPTH_5)
#define VEC_TYPE float8
#define TYPE float
#define CONVERT_TYPE convert_float8
#define MIN_VAL (-FLT_MAX)
#define MAX_VAL FLT_MAX
#endif
#if defined (DEPTH_6)
#define VEC_TYPE double8
#define TYPE double
#define CONVERT_TYPE convert_double8
#define MIN_VAL (-DBL_MAX)
#define MAX_VAL DBL_MAX
#endif
#if defined (REPEAT_E0)
#define repeat_me(a) a = a;
#endif
#if defined (REPEAT_E1)
#define repeat_me(a) a.s7 = 0;
#endif
#if defined (REPEAT_E2)
#define repeat_me(a) a.s7 = 0;a.s6 = 0;
#endif
#if defined (REPEAT_E3)
#define repeat_me(a) a.s7 = 0;a.s6 = 0;a.s5 = 0;
#endif
#if defined (REPEAT_E4)
#define repeat_me(a) a.s7 = 0;a.s6 = 0;a.s5 = 0;a.s4 = 0;
#endif
#if defined (REPEAT_E5)
#define repeat_me(a) a.s7 = 0;a.s6 = 0;a.s5 = 0;a.s4 = 0;a.s3 = 0;
#endif
#if defined (REPEAT_E6)
#define repeat_me(a) a.s7 = 0;a.s6 = 0;a.s5 = 0;a.s4 = 0;a.s3 = 0;a.s2 = 0;
#endif
#if defined (REPEAT_E7)
#define repeat_me(a) a.s7 = 0;a.s6 = 0;a.s5 = 0;a.s4 = 0;a.s3 = 0;a.s2 = 0;a.s1 = 0;
#endif
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics:enable
#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics:enable
/**************************************Array minMax mask**************************************/
__kernel void arithm_op_minMax_mask (int cols,int invalid_cols,int offset,int elemnum,int groupnum, __global TYPE *src,
int minvalid_cols,int moffset, __global uchar *mask,__global VEC_TYPE *dst)
{
unsigned int lid = get_local_id(0);
unsigned int gid = get_group_id(0);
unsigned int id = get_global_id(0);
unsigned int idx = id + (id / cols) * invalid_cols;
unsigned int midx = id + (id / cols) * minvalid_cols;
__local VEC_TYPE localmem_max[128],localmem_min[128];
VEC_TYPE minval,maxval,temp,m_temp;
if(id < elemnum)
{
temp = vload8(idx, &src[offset]);
m_temp = CONVERT_TYPE(vload8(midx,&mask[moffset]));
if(id % cols == cols - 1)
{
repeat_me(m_temp);
}
minval = m_temp != (VEC_TYPE)0 ? temp : (VEC_TYPE)MAX_VAL;
maxval = m_temp != (VEC_TYPE)0 ? temp : (VEC_TYPE)MIN_VAL;
}
else
{
minval = MAX_VAL;
maxval = MIN_VAL;
}
for(id=id + (groupnum << 8); id < elemnum;id = id + (groupnum << 8))
{
idx = id + (id / cols) * invalid_cols;
midx = id + (id / cols) * minvalid_cols;
temp = vload8(idx, &src[offset]);
m_temp = CONVERT_TYPE(vload8(midx,&mask[moffset]));
if(id % cols == cols - 1)
{
repeat_me(m_temp);
}
minval = min(minval,m_temp != (VEC_TYPE)0 ? temp : minval);
maxval = max(maxval,m_temp != (VEC_TYPE)0 ? temp : maxval);
}
if(lid > 127)
{
localmem_min[lid - 128] = minval;
localmem_max[lid - 128] = maxval;
}
barrier(CLK_LOCAL_MEM_FENCE);
if(lid < 128)
{
localmem_min[lid] = min(minval,localmem_min[lid]);
localmem_max[lid] = max(maxval,localmem_max[lid]);
}
barrier(CLK_LOCAL_MEM_FENCE);
for(int lsize = 64; lsize > 0; lsize >>= 1)
{
if(lid < lsize)
{
int lid2 = lsize + lid;
localmem_min[lid] = min(localmem_min[lid] , localmem_min[lid2]);
localmem_max[lid] = max(localmem_max[lid] , localmem_max[lid2]);
}
barrier(CLK_LOCAL_MEM_FENCE);
}
if( lid == 0)
{
dst[gid] = localmem_min[0];
dst[gid + groupnum] = localmem_max[0];
}
}

View File

@ -0,0 +1,253 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Jia Haipeng, jiahaipeng95@gmail.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if defined DOUBLE_SUPPORT
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#endif
int4 round_int4(float4 v){
v.s0 = v.s0 + (v.s0 > 0 ? 0.5 : -0.5);
v.s1 = v.s1 + (v.s1 > 0 ? 0.5 : -0.5);
v.s2 = v.s2 + (v.s2 > 0 ? 0.5 : -0.5);
v.s3 = v.s3 + (v.s3 > 0 ? 0.5 : -0.5);
return convert_int4_sat(v);
}
uint4 round_uint4(float4 v){
v.s0 = v.s0 + (v.s0 > 0 ? 0.5 : -0.5);
v.s1 = v.s1 + (v.s1 > 0 ? 0.5 : -0.5);
v.s2 = v.s2 + (v.s2 > 0 ? 0.5 : -0.5);
v.s3 = v.s3 + (v.s3 > 0 ? 0.5 : -0.5);
return convert_uint4_sat(v);
}
int round_int(float v){
v = v + (v > 0 ? 0.5 : -0.5);
return convert_int_sat(v);
}
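// Rounding sketch: these helpers round half away from zero before
// saturating, e.g. round_int(2.5f) -> 3 and round_int(-2.5f) -> -3, whereas
// the built-in convert_int4_sat_rte() would round both halves to the nearest
// even value (2 and -2). The explicit +/-0.5 is therefore what pins down the
// "half away from zero" behaviour rather than IEEE round-to-even.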
//////////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////multiply//////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////////
/**************************************multiply without mask**************************************/
__kernel void arithm_mul_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1, float scalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, (dst_offset + x) & (int)0xfffffffc); // parenthesized: '&' binds looser than '+'
uchar4 src1_data = vload4(0, src1 + src1_index);
uchar4 src2_data = vload4(0, src2 + src2_index);
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
int4 tmp = convert_int4_sat(src1_data) * convert_int4_sat(src2_data);
tmp = round_int4(convert_float4(tmp) * scalar);
uchar4 tmp_data = convert_uchar4_sat(tmp);
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;
*((__global uchar4 *)(dst + dst_index)) = dst_data;
}
}
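// Worked example for the uchar path above: with scalar = 1.0f/255 and pixel
// values src1 = 200, src2 = 200, the widened product is 40000, scaling gives
// 156.86..., rounding half away from zero gives 157, which fits in uchar and
// is stored as-is; a product such as 255 * 255 with scalar = 1.0f would
// instead saturate to 255.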
__kernel void arithm_mul_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *src2, int src2_step, int src2_offset,
__global ushort *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1, float scalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#undef dst_align
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, (dst_offset + (x << 1)) & (int)0xfffffff8); // parenthesized: '&' binds looser than '+'
ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index));
ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));
ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index));
uint4 tmp = convert_uint4_sat(src1_data) * convert_uint4_sat(src2_data);
tmp = round_uint4(convert_float4(tmp) * scalar);
ushort4 tmp_data = convert_ushort4_sat(tmp);
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y;
dst_data.z = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.z : dst_data.z;
dst_data.w = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) ? tmp_data.w : dst_data.w;
*((__global ushort4 *)((__global char *)dst + dst_index)) = dst_data;
}
}
__kernel void arithm_mul_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *src2, int src2_step, int src2_offset,
__global short *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1, float scalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#undef dst_align
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, (dst_offset + (x << 1)) & (int)0xfffffff8); // parenthesized: '&' binds looser than '+'
short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index));
short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));
short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
int4 tmp = convert_int4_sat(src1_data) * convert_int4_sat(src2_data);
tmp = round_int4(convert_float4(tmp) * scalar);
short4 tmp_data = convert_short4_sat(tmp);
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y;
dst_data.z = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.z : dst_data.z;
dst_data.w = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) ? tmp_data.w : dst_data.w;
*((__global short4 *)((__global char *)dst + dst_index)) = dst_data;
}
}
__kernel void arithm_mul_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *src2, int src2_step, int src2_offset,
__global int *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1, float scalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
int data1 = *((__global int *)((__global char *)src1 + src1_index));
int data2 = *((__global int *)((__global char *)src2 + src2_index));
int tmp = data1 * data2;
tmp = round_int((float)tmp * scalar);
*((__global int *)((__global char *)dst + dst_index)) = convert_int_sat(tmp);
}
}
__kernel void arithm_mul_D5 (__global float *src1, int src1_step, int src1_offset,
__global float *src2, int src2_step, int src2_offset,
__global float *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1, float scalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
float data1 = *((__global float *)((__global char *)src1 + src1_index));
float data2 = *((__global float *)((__global char *)src2 + src2_index));
float tmp = data1 * data2;
tmp = tmp * scalar;
*((__global float *)((__global char *)dst + dst_index)) = tmp;
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_mul_D6 (__global double *src1, int src1_step, int src1_offset,
__global double *src2, int src2_step, int src2_offset,
__global double *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1, double scalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset);
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
double data1 = *((__global double *)((__global char *)src1 + src1_index));
double data2 = *((__global double *)((__global char *)src2 + src2_index));
double tmp = data1 * data2;
tmp = tmp * scalar;
*((__global double *)((__global char *)dst + dst_index)) = tmp;
}
}
#endif

View File

@ -0,0 +1,191 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Shengen Yan, yanshengen@gmail.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
/**************************************PUBLICFUNC*************************************/
#if defined (DOUBLE_SUPPORT)
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#endif
#if defined (DEPTH_0)
#define VEC_TYPE uchar8
#endif
#if defined (DEPTH_1)
#define VEC_TYPE char8
#endif
#if defined (DEPTH_2)
#define VEC_TYPE ushort8
#endif
#if defined (DEPTH_3)
#define VEC_TYPE short8
#endif
#if defined (DEPTH_4)
#define VEC_TYPE int8
#endif
#if defined (DEPTH_5)
#define VEC_TYPE float8
#endif
#if defined (DEPTH_6)
#define VEC_TYPE double8
#endif
#if defined (REPEAT_S0)
#define repeat_s(a) a = a;
#endif
#if defined (REPEAT_S1)
#define repeat_s(a) a.s0 = 0;
#endif
#if defined (REPEAT_S2)
#define repeat_s(a) a.s0 = 0;a.s1 = 0;
#endif
#if defined (REPEAT_S3)
#define repeat_s(a) a.s0 = 0;a.s1 = 0;a.s2 = 0;
#endif
#if defined (REPEAT_S4)
#define repeat_s(a) a.s0 = 0;a.s1 = 0;a.s2 = 0;a.s3 = 0;
#endif
#if defined (REPEAT_S5)
#define repeat_s(a) a.s0 = 0;a.s1 = 0;a.s2 = 0;a.s3 = 0;a.s4 = 0;
#endif
#if defined (REPEAT_S6)
#define repeat_s(a) a.s0 = 0;a.s1 = 0;a.s2 = 0;a.s3 = 0;a.s4 = 0;a.s5 = 0;
#endif
#if defined (REPEAT_S7)
#define repeat_s(a) a.s0 = 0;a.s1 = 0;a.s2 = 0;a.s3 = 0;a.s4 = 0;a.s5 = 0;a.s6 = 0;
#endif
#if defined (REPEAT_E0)
#define repeat_e(a) a = a;
#endif
#if defined (REPEAT_E1)
#define repeat_e(a) a.s7 = 0;
#endif
#if defined (REPEAT_E2)
#define repeat_e(a) a.s7 = 0;a.s6 = 0;
#endif
#if defined (REPEAT_E3)
#define repeat_e(a) a.s7 = 0;a.s6 = 0;a.s5 = 0;
#endif
#if defined (REPEAT_E4)
#define repeat_e(a) a.s7 = 0;a.s6 = 0;a.s5 = 0;a.s4 = 0;
#endif
#if defined (REPEAT_E5)
#define repeat_e(a) a.s7 = 0;a.s6 = 0;a.s5 = 0;a.s4 = 0;a.s3 = 0;
#endif
#if defined (REPEAT_E6)
#define repeat_e(a) a.s7 = 0;a.s6 = 0;a.s5 = 0;a.s4 = 0;a.s3 = 0;a.s2 = 0;
#endif
#if defined (REPEAT_E7)
#define repeat_e(a) a.s7 = 0;a.s6 = 0;a.s5 = 0;a.s4 = 0;a.s3 = 0;a.s2 = 0;a.s1 = 0;
#endif
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics:enable
#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics:enable
/**************************************Count NonZero**************************************/
__kernel void arithm_op_nonzero (int cols,int invalid_cols,int offset,int elemnum,int groupnum,
__global VEC_TYPE *src, __global int8 *dst)
{
unsigned int lid = get_local_id(0);
unsigned int gid = get_group_id(0);
unsigned int id = get_global_id(0);
unsigned int idx = offset + id + (id / cols) * invalid_cols;
__local int8 localmem_nonzero[128];
int8 nonzero;
VEC_TYPE zero=0,one=1,temp;
if(id < elemnum)
{
temp = src[idx];
if(id % cols == 0 )
{
repeat_s(temp);
}
if(id % cols == cols - 1)
{
repeat_e(temp);
}
nonzero = convert_int8(temp == zero ? zero:one);
}
else
{
nonzero = 0;
}
for(id = id + (groupnum << 8); id < elemnum; id = id + (groupnum << 8))
{
idx = offset + id + (id / cols) * invalid_cols;
temp = src[idx];
if(id % cols == 0 )
{
repeat_s(temp);
}
if(id % cols == cols - 1)
{
repeat_e(temp);
}
nonzero = nonzero + convert_int8(temp == zero ? zero:one);
}
if(lid > 127)
{
localmem_nonzero[lid - 128] = nonzero;
}
barrier(CLK_LOCAL_MEM_FENCE);
if(lid < 128)
{
localmem_nonzero[lid] = nonzero + localmem_nonzero[lid];
}
barrier(CLK_LOCAL_MEM_FENCE);
for(int lsize = 64; lsize > 0; lsize >>= 1)
{
if(lid < lsize)
{
int lid2 = lsize + lid;
localmem_nonzero[lid] = localmem_nonzero[lid] + localmem_nonzero[lid2];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
if( lid == 0)
{
dst[gid] = localmem_nonzero[0];
}
}
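
The kernel above is a two-stage count-nonzero reduction: each work-item accumulates a grid-stride partial count (the groupnum << 8 stride implies a 256-item work-group), the group folds those counts in local memory, and work-item 0 writes one int8 partial per group to dst[gid]. The host still has to add up the groupnum partials; a minimal C sketch of that last step follows (finish_nonzero_reduction and partial are hypothetical names, not part of the module).

/* Minimal host-side sketch (not OpenCV code): finishing the reduction that
 * arithm_op_nonzero starts on the device. Assumes the groupnum int8 partial
 * sums written to dst have been read back into `partial`, laid out as
 * groupnum consecutive groups of eight 32-bit lanes. */
#include <stdio.h>

static long long finish_nonzero_reduction(const int *partial, int groupnum)
{
    long long total = 0;
    for (int g = 0; g < groupnum; ++g)        /* one int8 per work-group */
        for (int lane = 0; lane < 8; ++lane)  /* sum the 8 vector lanes  */
            total += partial[g * 8 + lane];
    return total;
}

int main(void)
{
    int partial[2 * 8] = { 1, 0, 2, 0, 0, 3, 0, 1,   /* group 0 */
                           0, 4, 0, 0, 1, 0, 2, 0 }; /* group 1 */
    printf("nonzero count = %lld\n", finish_nonzero_reduction(partial, 2));
    return 0;
}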

View File

@ -0,0 +1,154 @@
////////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Jia Haipeng, jiahaipeng95@gmail.com
//
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//
#if defined (DOUBLE_SUPPORT)
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#endif
#define CV_PI 3.1415926535898
/**************************************phase inradians**************************************/
__kernel void arithm_phase_inradians_D5 (__global float *src1, int src1_step, int src1_offset,
__global float *src2, int src2_step, int src2_offset,
__global float *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if(x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
float data1 = *((__global float *)((__global char *)src1 + src1_index));
float data2 = *((__global float *)((__global char *)src2 + src2_index));
float tmp = atan2(data2,data1);
*((__global float *)((__global char *)dst + dst_index)) = tmp;
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_phase_inradians_D6 (__global double *src1, int src1_step, int src1_offset,
__global double *src2, int src2_step, int src2_offset,
__global double *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if(x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset);
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
double data1 = *((__global double *)((__global char *)src1 + src1_index));
double data2 = *((__global double *)((__global char *)src2 + src2_index));
*((__global double *)((__global char *)dst + dst_index)) = atan2(data2,data1);
}
}
#endif
/**************************************phase indegrees**************************************/
__kernel void arithm_phase_indegrees_D5 (__global float *src1, int src1_step, int src1_offset,
__global float *src2, int src2_step, int src2_offset,
__global float *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if(x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
float data1 = *((__global float *)((__global char *)src1 + src1_index));
float data2 = *((__global float *)((__global char *)src2 + src2_index));
float tmp = atan2(data2,data1);
float tmp_data = 180*tmp/CV_PI;
*((__global float *)((__global char *)dst + dst_index)) = tmp_data;
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_phase_indegrees_D6 (__global double *src1, int src1_step, int src1_offset,
__global double *src2, int src2_step, int src2_offset,
__global double *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if(x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset);
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
double data1 = *((__global double *)((__global char *)src1 + src1_index));
double data2 = *((__global double *)((__global char *)src2 + src2_index));
double tmp = atan2(data2,data1);
double tmp_data = 180*tmp/CV_PI;
*((__global double *)((__global char *)dst + dst_index)) = tmp_data;
}
}
#endif
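
Per element, these kernels compute phase = atan2(data2, data1); the _indegrees variants then scale by 180/CV_PI. A plain-C sketch of the same arithmetic (phase_indegrees is a hypothetical helper, not OpenCV API):

/* CPU sketch of the per-element math in arithm_phase_indegrees_D5. */
#include <math.h>
#include <stdio.h>

#define CV_PI 3.1415926535898

static float phase_indegrees(float data1, float data2)
{
    float tmp = atan2f(data2, data1);   /* radians, as in the _inradians kernels */
    return 180.0f * tmp / (float)CV_PI; /* degrees, as in the _indegrees kernels */
}

int main(void)
{
    printf("%f\n", phase_indegrees(1.0f, 1.0f)); /* prints ~45.0 */
    return 0;
}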

View File

@ -0,0 +1,174 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Jia Haipeng, jiahaipeng95@gmail.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if defined (DOUBLE_SUPPORT)
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#endif
#define CV_PI 3.1415926535897932384626433832795
/////////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////polarToCart with magnitude//////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////
__kernel void arithm_polarToCart_mag_D5 (__global float *src1, int src1_step, int src1_offset,//magnitude
__global float *src2, int src2_step, int src2_offset,//angle
__global float *dst1, int dst1_step, int dst1_offset,
__global float *dst2, int dst2_step, int dst2_offset,
int rows, int cols, int angInDegree)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset);
int dst1_index = mad24(y, dst1_step, (x << 2) + dst1_offset);
int dst2_index = mad24(y, dst2_step, (x << 2) + dst2_offset);
float x = *((__global float *)((__global char *)src1 + src1_index));
float y = *((__global float *)((__global char *)src2 + src2_index));
float ascale = CV_PI/180.0;
float alpha = angInDegree == 1 ? y * ascale : y;
float a = cos(alpha) * x;
float b = sin(alpha) * x;
*((__global float *)((__global char *)dst1 + dst1_index)) = a;
*((__global float *)((__global char *)dst2 + dst2_index)) = b;
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_polarToCart_mag_D6 (__global double *src1, int src1_step, int src1_offset,//magnitude
__global double *src2, int src2_step, int src2_offset,//angle
__global double *dst1, int dst1_step, int dst1_offset,
__global double *dst2, int dst2_step, int dst2_offset,
int rows, int cols, int angInDegree)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset);
int dst1_index = mad24(y, dst1_step, (x << 3) + dst1_offset);
int dst2_index = mad24(y, dst2_step, (x << 3) + dst2_offset);
double x = *((__global double *)((__global char *)src1 + src1_index));
double y = *((__global double *)((__global char *)src2 + src2_index));
double ascale = CV_PI/180.0;
double alpha = angInDegree == 1 ? y * ascale : y;
double a = cos(alpha) * x;
double b = sin(alpha) * x;
*((__global double *)((__global char *)dst1 + dst1_index)) = a;
*((__global double *)((__global char *)dst2 + dst2_index)) = b;
}
}
#endif
/////////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////polarToCart without magnitude//////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////
__kernel void arithm_polarToCart_D5 (__global float *src, int src_step, int src_offset,//angle
__global float *dst1, int dst1_step, int dst1_offset,
__global float *dst2, int dst2_step, int dst2_offset,
int rows, int cols, int angInDegree)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src_index = mad24(y, src_step, (x << 2) + src_offset);
int dst1_index = mad24(y, dst1_step, (x << 2) + dst1_offset);
int dst2_index = mad24(y, dst2_step, (x << 2) + dst2_offset);
float y = *((__global float *)((__global char *)src + src_index));
float ascale = CV_PI/180.0;
float alpha = angInDegree == 1 ? y * ascale : y;
float a = cos(alpha);
float b = sin(alpha);
*((__global float *)((__global char *)dst1 + dst1_index)) = a;
*((__global float *)((__global char *)dst2 + dst2_index)) = b;
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_polarToCart_D6 (__global double *src, int src_step, int src_offset,//angle
__global double *dst1, int dst1_step, int dst1_offset,
__global double *dst2, int dst2_step, int dst2_offset,
int rows, int cols, int angInDegree)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src_index = mad24(y, src_step, (x << 3) + src_offset);
int dst1_index = mad24(y, dst1_step, (x << 3) + dst1_offset);
int dst2_index = mad24(y, dst2_step, (x << 3) + dst2_offset);
double y = *((__global double *)((__global char *)src + src_index));
double ascale = CV_PI/180.0;
double alpha = angInDegree == 1 ? y * ascale : y;
double a = cos(alpha);
double b = sin(alpha);
*((__global double *)((__global char *)dst1 + dst1_index)) = a;
*((__global double *)((__global char *)dst2 + dst2_index)) = b;
}
}
#endif
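
These kernels invert the phase/magnitude mapping: a = cos(alpha) * mag and b = sin(alpha) * mag, where alpha is the input angle converted from degrees when angInDegree is set (and mag is implicitly 1 in the second pair of kernels). A plain-C sketch under those assumptions (polar_to_cart is a hypothetical helper):

/* CPU sketch of arithm_polarToCart_mag_D5's per-element math (illustrative). */
#include <math.h>
#include <stdio.h>

#define CV_PI 3.1415926535897932384626433832795

static void polar_to_cart(float mag, float angle, int angInDegree,
                          float *a, float *b)
{
    float ascale = (float)(CV_PI / 180.0);
    float alpha = angInDegree == 1 ? angle * ascale : angle;
    *a = cosf(alpha) * mag;
    *b = sinf(alpha) * mag;
}

int main(void)
{
    float a, b;
    polar_to_cart(2.0f, 90.0f, 1, &a, &b); /* expect a ~ 0, b ~ 2 */
    printf("a=%f b=%f\n", a, b);
    return 0;
}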

View File

@ -0,0 +1,97 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Jiang Liyuan, jlyuan001.good@163.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if defined (DOUBLE_SUPPORT)
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#endif
/************************************** pow **************************************/
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_pow_D5 (__global float *src1, int src1_step, int src1_offset,
__global float *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1,
double p)
{
int x = get_global_id(0);
int y = get_global_id(1);
if(x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
float src1_data = *((__global float *)((__global char *)src1 + src1_index));
float tmp = src1_data > 0 ? exp(p * log(src1_data)) : (src1_data == 0 ? 0 : exp(p * log(fabs(src1_data))));
*((__global float *)((__global char *)dst + dst_index)) = tmp;
}
}
#endif
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_pow_D6 (__global double *src1, int src1_step, int src1_offset,
__global double *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1,
double p)
{
int x = get_global_id(0);
int y = get_global_id(1);
if(x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
double src1_data = *((__global double *)((__global char *)src1 + src1_index));
double tmp = src1_data > 0 ? exp(p * log(src1_data)) : (src1_data == 0 ? 0 : exp(p * log(fabs(src1_data))));
*((__global double *)((__global char *)dst + dst_index)) = tmp;
}
}
#endif
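
Both kernels sit behind DOUBLE_SUPPORT because the exponent p arrives as a double, and both evaluate x^p as exp(p * log(x)), falling back to exp(p * log(|x|)) for negative inputs. Note that this drops the sign, so negative bases with odd integer exponents differ from libm's pow. A single-precision C sketch of that behavior (ocl_pow_like is a hypothetical name):

/* CPU sketch of the pow kernels' per-element math (illustrative only). */
#include <math.h>
#include <stdio.h>

static float ocl_pow_like(float x, float p)
{
    if (x > 0)  return expf(p * logf(x));
    if (x == 0) return 0.0f;
    return expf(p * logf(fabsf(x)));    /* |x|^p: the sign is dropped */
}

int main(void)
{
    printf("kernel-style: %f\n", ocl_pow_like(-2.0f, 3.0f)); /* 8, not -8 */
    printf("libm powf:    %f\n", powf(-2.0f, 3.0f));         /* -8 */
    return 0;
}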

File diff suppressed because it is too large

View File

@ -0,0 +1,806 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Jia Haipeng, jiahaipeng95@gmail.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if defined (DOUBLE_SUPPORT)
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#endif
/**************************************sub with scalar without mask**************************************/
__kernel void arithm_s_sub_C1_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, (dst_offset + x) & (int)0xfffffffc);
uchar4 src1_data = vload4(0, src1 + src1_index);
int4 src2_data = (int4)(src2.x, src2.x, src2.x, src2.x);
uchar4 data = *((__global uchar4 *)(dst + dst_index));
int4 tmp = convert_int4_sat(src1_data) - src2_data;
tmp = isMatSubScalar ? tmp : -tmp;
uchar4 tmp_data = convert_uchar4_sat(tmp);
data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x;
data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : data.y;
data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : data.z;
data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : data.w;
*((__global uchar4 *)(dst + dst_index)) = data;
}
}
__kernel void arithm_s_sub_C1_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 1;
#undef dst_align
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, (dst_offset + (x << 1)) & (int)0xfffffffc);
ushort2 src1_data = vload2(0, (__global ushort *)((__global char *)src1 + src1_index));
int2 src2_data = (int2)(src2.x, src2.x);
ushort2 data = *((__global ushort2 *)((__global uchar *)dst + dst_index));
int2 tmp = convert_int2_sat(src1_data) - src2_data;
tmp = isMatSubScalar ? tmp : -tmp;
ushort2 tmp_data = convert_ushort2_sat(tmp);
data.x = (dst_index + 0 >= dst_start) ? tmp_data.x : data.x;
data.y = (dst_index + 2 < dst_end ) ? tmp_data.y : data.y;
*((__global ushort2 *)((__global uchar *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_sub_C1_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, (dst_offset + (x << 1)) & (int)0xfffffffc);
short2 src1_data = vload2(0, (__global short *)((__global char *)src1 + src1_index));
int2 src2_data = (int2)(src2.x, src2.x);
short2 data = *((__global short2 *)((__global uchar *)dst + dst_index));
int2 tmp = convert_int2_sat(src1_data) - src2_data;
tmp = isMatSubScalar ? tmp : -tmp;
short2 tmp_data = convert_short2_sat(tmp);
data.x = (dst_index + 0 >= dst_start) ? tmp_data.x : data.x;
data.y = (dst_index + 2 < dst_end ) ? tmp_data.y : data.y;
*((__global short2 *)((__global uchar *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_sub_C1_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
int src_data1 = *((__global int *)((__global char *)src1 + src1_index));
int src_data2 = src2.x;
long tmp = (long)src_data1 - (long)src_data2;
tmp = isMatSubScalar ? tmp : -tmp;
int data = convert_int_sat(tmp);
*((__global int *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_sub_C1_D5 (__global float *src1, int src1_step, int src1_offset,
__global float *dst, int dst_step, int dst_offset,
float4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
float src_data1 = *((__global float *)((__global char *)src1 + src1_index));
float src_data2 = src2.x;
float tmp = src_data1 - src_data2;
tmp = isMatSubScalar ? tmp : -tmp;
*((__global float *)((__global char *)dst + dst_index)) = tmp;
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_s_sub_C1_D6 (__global double *src1, int src1_step, int src1_offset,
__global double *dst, int dst_step, int dst_offset,
double4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
double src_data1 = *((__global double *)((__global char *)src1 + src1_index));
double src2_data = src2.x;
double data = src_data1 - src2_data;
data = isMatSubScalar ? data : -data;
*((__global double *)((__global char *)dst + dst_index)) = data;
}
}
#endif
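
Every kernel in this file takes an isMatSubScalar flag: the difference is always computed as mat - scalar in a widened type, negated when the caller actually asked for scalar - mat, and only then saturated to the destination depth. A plain-C sketch of the uchar case (saturate_uchar and sub_scalar_u8 are hypothetical helpers):

/* CPU sketch of the isMatSubScalar trick used throughout these kernels. */
#include <stdio.h>

static unsigned char saturate_uchar(int v)
{
    return v < 0 ? 0 : v > 255 ? 255 : (unsigned char)v;
}

static unsigned char sub_scalar_u8(unsigned char src, int scalar, int isMatSubScalar)
{
    int tmp = (int)src - scalar;            /* widened subtraction */
    tmp = isMatSubScalar ? tmp : -tmp;      /* flip for scalar - mat */
    return saturate_uchar(tmp);
}

int main(void)
{
    printf("%u\n", sub_scalar_u8(10, 30, 1)); /* mat - scalar: saturates to 0 */
    printf("%u\n", sub_scalar_u8(10, 30, 0)); /* scalar - mat: 20 */
    return 0;
}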
__kernel void arithm_s_sub_C2_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, (dst_offset + (x << 1)) & (int)0xfffffffc);
uchar4 src1_data = vload4(0, src1 + src1_index);
int4 src2_data = (int4)(src2.x, src2.y, src2.x, src2.y);
uchar4 data = *((__global uchar4 *)(dst + dst_index));
int4 tmp = convert_int4_sat(src1_data) - src2_data;
tmp = isMatSubScalar ? tmp : -tmp;
uchar4 tmp_data = convert_uchar4_sat(tmp);
data.xy = (dst_index + 0 >= dst_start) ? tmp_data.xy : data.xy;
data.zw = (dst_index + 2 < dst_end ) ? tmp_data.zw : data.zw;
*((__global uchar4 *)(dst + dst_index)) = data;
}
}
__kernel void arithm_s_sub_C2_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
ushort2 src_data1 = *((__global ushort2 *)((__global char *)src1 + src1_index));
int2 src_data2 = (int2)(src2.x, src2.y);
ushort2 dst_data = *((__global ushort2 *)((__global char *)dst + dst_index));
int2 tmp = convert_int2_sat(src_data1) - src_data2;
tmp = isMatSubScalar ? tmp : -tmp;
ushort2 data = convert_ushort2_sat(tmp);
*((__global ushort2 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_sub_C2_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
short2 src_data1 = *((__global short2 *)((__global char *)src1 + src1_index));
int2 src_data2 = (int2)(src2.x, src2.y);
short2 dst_data = *((__global short2 *)((__global char *)dst + dst_index));
int2 tmp = convert_int2_sat(src_data1) - src_data2;
tmp = isMatSubScalar ? tmp : -tmp;
short2 data = convert_short2_sat(tmp);
*((__global short2 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_sub_C2_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
int2 src_data1 = *((__global int2 *)((__global char *)src1 + src1_index));
int2 src_data2 = (int2)(src2.x, src2.y);
int2 dst_data = *((__global int2 *)((__global char *)dst + dst_index));
long2 tmp = convert_long2_sat(src_data1) - convert_long2_sat(src_data2);
tmp = isMatSubScalar ? tmp : -tmp;
int2 data = convert_int2_sat(tmp);
*((__global int2 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_sub_C2_D5 (__global float *src1, int src1_step, int src1_offset,
__global float *dst, int dst_step, int dst_offset,
float4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
float2 src_data1 = *((__global float2 *)((__global char *)src1 + src1_index));
float2 src_data2 = (float2)(src2.x, src2.y);
float2 dst_data = *((__global float2 *)((__global char *)dst + dst_index));
float2 tmp = src_data1 - src_data2;
tmp = isMatSubScalar ? tmp : -tmp;
*((__global float2 *)((__global char *)dst + dst_index)) = tmp;
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_s_sub_C2_D6 (__global double *src1, int src1_step, int src1_offset,
__global double *dst, int dst_step, int dst_offset,
double4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 4) + dst_offset);
double2 src_data1 = *((__global double2 *)((__global char *)src1 + src1_index));
double2 src_data2 = (double2)(src2.x, src2.y);
double2 dst_data = *((__global double2 *)((__global char *)dst + dst_index));
double2 data = src_data1 - src_data2;
data = isMatSubScalar ? data : -data;
*((__global double2 *)((__global char *)dst + dst_index)) = data;
}
}
#endif
__kernel void arithm_s_sub_C3_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#undef dst_align
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3));
uchar4 src1_data_0 = vload4(0, src1 + src1_index + 0);
uchar4 src1_data_1 = vload4(0, src1 + src1_index + 4);
uchar4 src1_data_2 = vload4(0, src1 + src1_index + 8);
int4 src2_data_0 = (int4)(src2.x, src2.y, src2.z, src2.x);
int4 src2_data_1 = (int4)(src2.y, src2.z, src2.x, src2.y);
int4 src2_data_2 = (int4)(src2.z, src2.x, src2.y, src2.z);
uchar4 data_0 = *((__global uchar4 *)(dst + dst_index + 0));
uchar4 data_1 = *((__global uchar4 *)(dst + dst_index + 4));
uchar4 data_2 = *((__global uchar4 *)(dst + dst_index + 8));
int4 tmp_0 = convert_int4_sat(src1_data_0) - src2_data_0;
int4 tmp_1 = convert_int4_sat(src1_data_1) - src2_data_1;
int4 tmp_2 = convert_int4_sat(src1_data_2) - src2_data_2;
tmp_0 = isMatSubScalar ? tmp_0 : -tmp_0;
tmp_1 = isMatSubScalar ? tmp_1 : -tmp_1;
tmp_2 = isMatSubScalar ? tmp_2 : -tmp_2;
uchar4 tmp_data_0 = convert_uchar4_sat(tmp_0);
uchar4 tmp_data_1 = convert_uchar4_sat(tmp_1);
uchar4 tmp_data_2 = convert_uchar4_sat(tmp_2);
data_0.xyz = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
data_0.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_0.w : data_0.w;
data_1.xy = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_1.xy : data_1.xy;
data_1.zw = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.zw : data_1.zw;
data_2.x = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.x : data_2.x;
data_2.yzw = ((dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
? tmp_data_2.yzw : data_2.yzw;
*((__global uchar4 *)(dst + dst_index + 0)) = data_0;
*((__global uchar4 *)(dst + dst_index + 4)) = data_1;
*((__global uchar4 *)(dst + dst_index + 8)) = data_2;
}
}
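
The C3_D0 kernel above covers 4 three-channel pixels (12 bytes) with three uchar4 loads, so the scalar must be replicated in the rotating channel pattern of src2_data_0..2. A small C sketch that reproduces that layout (illustrative only):

/* CPU sketch of the 3-channel scalar layout used by arithm_s_sub_C3_D0:
 * byte i of the 12-byte run belongs to channel i % 3. */
#include <stdio.h>

int main(void)
{
    int scalar[3] = { 10, 20, 30 };            /* src2.x, src2.y, src2.z */
    int vec[3][4];
    for (int i = 0; i < 12; ++i)
        vec[i / 4][i % 4] = scalar[i % 3];     /* channel of byte i */
    for (int v = 0; v < 3; ++v)
        printf("src2_data_%d = (%d, %d, %d, %d)\n",
               v, vec[v][0], vec[v][1], vec[v][2], vec[v][3]);
    return 0;   /* prints (10,20,30,10), (20,30,10,20), (30,10,20,30) */
}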
__kernel void arithm_s_sub_C3_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 1;
#undef dst_align
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6));
ushort2 src1_data_0 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 0));
ushort2 src1_data_1 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 4));
ushort2 src1_data_2 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 8));
int2 src2_data_0 = (int2)(src2.x, src2.y);
int2 src2_data_1 = (int2)(src2.z, src2.x);
int2 src2_data_2 = (int2)(src2.y, src2.z);
ushort2 data_0 = *((__global ushort2 *)((__global char *)dst + dst_index + 0));
ushort2 data_1 = *((__global ushort2 *)((__global char *)dst + dst_index + 4));
ushort2 data_2 = *((__global ushort2 *)((__global char *)dst + dst_index + 8));
int2 tmp_0 = convert_int2_sat(src1_data_0) - src2_data_0;
int2 tmp_1 = convert_int2_sat(src1_data_1) - src2_data_1;
int2 tmp_2 = convert_int2_sat(src1_data_2) - src2_data_2;
tmp_0 = isMatSubScalar ? tmp_0 : -tmp_0;
tmp_1 = isMatSubScalar ? tmp_1 : -tmp_1;
tmp_2 = isMatSubScalar ? tmp_2 : -tmp_2;
ushort2 tmp_data_0 = convert_ushort2_sat(tmp_0);
ushort2 tmp_data_1 = convert_ushort2_sat(tmp_1);
ushort2 tmp_data_2 = convert_ushort2_sat(tmp_2);
data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;
data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;
*((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
*((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
*((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
}
}
__kernel void arithm_s_sub_C3_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 1;
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6));
short2 src1_data_0 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 0));
short2 src1_data_1 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 4));
short2 src1_data_2 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 8));
int2 src2_data_0 = (int2)(src2.x, src2.y);
int2 src2_data_1 = (int2)(src2.z, src2.x);
int2 src2_data_2 = (int2)(src2.y, src2.z);
short2 data_0 = *((__global short2 *)((__global char *)dst + dst_index + 0));
short2 data_1 = *((__global short2 *)((__global char *)dst + dst_index + 4));
short2 data_2 = *((__global short2 *)((__global char *)dst + dst_index + 8));
int2 tmp_0 = convert_int2_sat(src1_data_0) - src2_data_0;
int2 tmp_1 = convert_int2_sat(src1_data_1) - src2_data_1;
int2 tmp_2 = convert_int2_sat(src1_data_2) - src2_data_2;
tmp_0 = isMatSubScalar ? tmp_0 : -tmp_0;
tmp_1 = isMatSubScalar ? tmp_1 : -tmp_1;
tmp_2 = isMatSubScalar ? tmp_2 : -tmp_2;
short2 tmp_data_0 = convert_short2_sat(tmp_0);
short2 tmp_data_1 = convert_short2_sat(tmp_1);
short2 tmp_data_2 = convert_short2_sat(tmp_2);
data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;
data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;
*((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
*((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
*((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
}
}
__kernel void arithm_s_sub_C3_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
int src1_data_0 = *((__global int *)((__global char *)src1 + src1_index + 0));
int src1_data_1 = *((__global int *)((__global char *)src1 + src1_index + 4));
int src1_data_2 = *((__global int *)((__global char *)src1 + src1_index + 8));
int src2_data_0 = src2.x;
int src2_data_1 = src2.y;
int src2_data_2 = src2.z;
int data_0 = *((__global int *)((__global char *)dst + dst_index + 0));
int data_1 = *((__global int *)((__global char *)dst + dst_index + 4));
int data_2 = *((__global int *)((__global char *)dst + dst_index + 8));
long tmp_0 = (long)src1_data_0 - (long)src2_data_0;
long tmp_1 = (long)src1_data_1 - (long)src2_data_1;
long tmp_2 = (long)src1_data_2 - (long)src2_data_2;
tmp_0 = isMatSubScalar ? tmp_0 : -tmp_0;
tmp_1 = isMatSubScalar ? tmp_1 : -tmp_1;
tmp_2 = isMatSubScalar ? tmp_2 : -tmp_2;
int tmp_data_0 = convert_int_sat(tmp_0);
int tmp_data_1 = convert_int_sat(tmp_1);
int tmp_data_2 = convert_int_sat(tmp_2);
*((__global int *)((__global char *)dst + dst_index + 0))= tmp_data_0;
*((__global int *)((__global char *)dst + dst_index + 4))= tmp_data_1;
*((__global int *)((__global char *)dst + dst_index + 8))= tmp_data_2;
}
}
__kernel void arithm_s_sub_C3_D5 (__global float *src1, int src1_step, int src1_offset,
__global float *dst, int dst_step, int dst_offset,
float4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
float src1_data_0 = *((__global float *)((__global char *)src1 + src1_index + 0));
float src1_data_1 = *((__global float *)((__global char *)src1 + src1_index + 4));
float src1_data_2 = *((__global float *)((__global char *)src1 + src1_index + 8));
float src2_data_0 = src2.x;
float src2_data_1 = src2.y;
float src2_data_2 = src2.z;
float data_0 = *((__global float *)((__global char *)dst + dst_index + 0));
float data_1 = *((__global float *)((__global char *)dst + dst_index + 4));
float data_2 = *((__global float *)((__global char *)dst + dst_index + 8));
float tmp_0 = src1_data_0 - src2_data_0;
float tmp_1 = src1_data_1 - src2_data_1;
float tmp_2 = src1_data_2 - src2_data_2;
tmp_0 = isMatSubScalar ? tmp_0 : -tmp_0;
tmp_1 = isMatSubScalar ? tmp_1 : -tmp_1;
tmp_2 = isMatSubScalar ? tmp_2 : -tmp_2;
*((__global float *)((__global char *)dst + dst_index + 0))= tmp_0;
*((__global float *)((__global char *)dst + dst_index + 4))= tmp_1;
*((__global float *)((__global char *)dst + dst_index + 8))= tmp_2;
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_s_sub_C3_D6 (__global double *src1, int src1_step, int src1_offset,
__global double *dst, int dst_step, int dst_offset,
double4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 24));
double src1_data_0 = *((__global double *)((__global char *)src1 + src1_index + 0 ));
double src1_data_1 = *((__global double *)((__global char *)src1 + src1_index + 8 ));
double src1_data_2 = *((__global double *)((__global char *)src1 + src1_index + 16));
double src2_data_0 = src2.x;
double src2_data_1 = src2.y;
double src2_data_2 = src2.z;
double data_0 = *((__global double *)((__global char *)dst + dst_index + 0 ));
double data_1 = *((__global double *)((__global char *)dst + dst_index + 8 ));
double data_2 = *((__global double *)((__global char *)dst + dst_index + 16));
double tmp_data_0 = src1_data_0 - src2_data_0;
double tmp_data_1 = src1_data_1 - src2_data_1;
double tmp_data_2 = src1_data_2 - src2_data_2;
tmp_data_0 = isMatSubScalar ? tmp_data_0 : -tmp_data_0;
tmp_data_1 = isMatSubScalar ? tmp_data_1 : -tmp_data_1;
tmp_data_2 = isMatSubScalar ? tmp_data_2 : -tmp_data_2;
*((__global double *)((__global char *)dst + dst_index + 0 ))= tmp_data_0;
*((__global double *)((__global char *)dst + dst_index + 8 ))= tmp_data_1;
*((__global double *)((__global char *)dst + dst_index + 16))= tmp_data_2;
}
}
#endif
__kernel void arithm_s_sub_C4_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
uchar4 src_data1 = *((__global uchar4 *)(src1 + src1_index));
int4 tmp = convert_int4_sat(src_data1) - src2;
tmp = isMatSubScalar ? tmp : -tmp;
uchar4 data = convert_uchar4_sat(tmp);
*((__global uchar4 *)(dst + dst_index)) = data;
}
}
__kernel void arithm_s_sub_C4_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
ushort4 src_data1 = *((__global ushort4 *)((__global char *)src1 + src1_index));
int4 tmp = convert_int4_sat(src_data1) - src2;
tmp = isMatSubScalar ? tmp : -tmp;
ushort4 data = convert_ushort4_sat(tmp);
*((__global ushort4 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_sub_C4_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
short4 src_data1 = *((__global short4 *)((__global char *)src1 + src1_index));
int4 tmp = convert_int4_sat(src_data1) - src2;
tmp = isMatSubScalar ? tmp : -tmp;
short4 data = convert_short4_sat(tmp);
*((__global short4 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_sub_C4_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 4) + dst_offset);
int4 src_data1 = *((__global int4 *)((__global char *)src1 + src1_index));
long4 tmp = convert_long4_sat(src_data1) - convert_long4_sat(src2);
tmp = isMatSubScalar ? tmp : -tmp;
int4 data = convert_int4_sat(tmp);
*((__global int4 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_sub_C4_D5 (__global float *src1, int src1_step, int src1_offset,
__global float *dst, int dst_step, int dst_offset,
float4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 4) + dst_offset);
float4 src_data1 = *((__global float4 *)((__global char *)src1 + src1_index));
float4 tmp = src_data1 - src2;
tmp = isMatSubScalar ? tmp : -tmp;
*((__global float4 *)((__global char *)dst + dst_index)) = tmp;
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_s_sub_C4_D6 (__global double *src1, int src1_step, int src1_offset,
__global double *dst, int dst_step, int dst_offset,
double4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 5) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 5) + dst_offset);
double4 src_data1 = *((__global double4 *)((__global char *)src1 + src1_index));
double4 data = src_data1 - src2;
data = isMatSubScalar ? data : -data;
*((__global double4 *)((__global char *)dst + dst_index)) = data;
}
}
#endif

View File

@ -0,0 +1,941 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Jia Haipeng, jiahaipeng95@gmail.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if defined (DOUBLE_SUPPORT)
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#endif
/**************************************sub with scalar with mask**************************************/
__kernel void arithm_s_sub_with_mask_C1_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, (dst_offset + x) & (int)0xfffffffc);
uchar4 src1_data = vload4(0, src1 + src1_index);
int4 src2_data = (int4)(src2.x, src2.x, src2.x, src2.x);
uchar4 mask_data = vload4(0, mask + mask_index);
uchar4 data = *((__global uchar4 *)(dst + dst_index));
int4 tmp = convert_int4_sat(src1_data) - src2_data;
tmp = isMatSubScalar ? tmp : -tmp;
uchar4 tmp_data = convert_uchar4_sat(tmp);
data.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x;
data.y = ((mask_data.y) && (dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : data.y;
data.z = ((mask_data.z) && (dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : data.z;
data.w = ((mask_data.w) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : data.w;
*((__global uchar4 *)(dst + dst_index)) = data;
}
}
__kernel void arithm_s_sub_with_mask_C1_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 1;
#undef dst_align
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, (dst_offset + (x << 1)) & (int)0xfffffffc);
ushort2 src1_data = vload2(0, (__global ushort *)((__global char *)src1 + src1_index));
int2 src2_data = (int2)(src2.x, src2.x);
uchar2 mask_data = vload2(0, mask + mask_index);
ushort2 data = *((__global ushort2 *)((__global uchar *)dst + dst_index));
int2 tmp = convert_int2_sat(src1_data) - src2_data;
tmp = isMatSubScalar ? tmp : -tmp;
ushort2 tmp_data = convert_ushort2_sat(tmp);
data.x = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.x : data.x;
data.y = ((mask_data.y) && (dst_index + 2 < dst_end )) ? tmp_data.y : data.y;
*((__global ushort2 *)((__global uchar *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_sub_with_mask_C1_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, (dst_offset + (x << 1)) & (int)0xfffffffc);
short2 src1_data = vload2(0, (__global short *)((__global char *)src1 + src1_index));
int2 src2_data = (int2)(src2.x, src2.x);
uchar2 mask_data = vload2(0, mask + mask_index);
short2 data = *((__global short2 *)((__global uchar *)dst + dst_index));
int2 tmp = convert_int2_sat(src1_data) - src2_data;
tmp = isMatSubScalar ? tmp : -tmp;
short2 tmp_data = convert_short2_sat(tmp);
data.x = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.x : data.x;
data.y = ((mask_data.y) && (dst_index + 2 < dst_end )) ? tmp_data.y : data.y;
*((__global short2 *)((__global uchar *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_sub_with_mask_C1_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
uchar mask_data = *(mask + mask_index);
int src_data1 = *((__global int *)((__global char *)src1 + src1_index));
int src_data2 = src2.x;
int dst_data = *((__global int *)((__global char *)dst + dst_index));
long tmp = (long)src_data1 - (long)src_data2;
tmp = isMatSubScalar ? tmp : -tmp;
int data = convert_int_sat(tmp);
data = mask_data ? data : dst_data;
*((__global int *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_sub_with_mask_C1_D5 (__global float *src1, int src1_step, int src1_offset,
__global float *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
float4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
uchar mask_data = *(mask + mask_index);
float src_data1 = *((__global float *)((__global char *)src1 + src1_index));
float src_data2 = src2.x;
float dst_data = *((__global float *)((__global char *)dst + dst_index));
float data = src_data1 - src_data2;
data = isMatSubScalar ? data : -data;
data = mask_data ? data : dst_data;
*((__global float *)((__global char *)dst + dst_index)) = data;
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_s_sub_with_mask_C1_D6 (__global double *src1, int src1_step, int src1_offset,
__global double *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
double4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
uchar mask_data = *(mask + mask_index);
double src_data1 = *((__global double *)((__global char *)src1 + src1_index));
double src_data2 = src2.x;
double dst_data = *((__global double *)((__global char *)dst + dst_index));
double data = src_data1 - src_data2;
data = isMatSubScalar ? data : -data;
data = mask_data ? data : dst_data;
*((__global double *)((__global char *)dst + dst_index)) = data;
}
}
#endif
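
The _with_mask variants change only the final write relative to the plain scalar kernels: an element is stored only where its mask byte is non-zero (the vectorized uchar paths additionally keep each lane inside the valid row span). A plain-C sketch of the per-element selection, with saturation omitted (masked_sub is a hypothetical helper):

/* CPU sketch of the masked-write selection used by the _with_mask kernels. */
#include <stdio.h>

static int masked_sub(int src, int scalar, int isMatSubScalar,
                      unsigned char mask, int dst_old)
{
    long tmp = (long)src - (long)scalar;   /* widened subtraction */
    tmp = isMatSubScalar ? tmp : -tmp;
    int data = (int)tmp;                   /* saturation omitted for brevity */
    return mask ? data : dst_old;          /* keep old dst where mask is 0 */
}

int main(void)
{
    printf("%d\n", masked_sub(10, 3, 1, 255, 99)); /* mask set: 7    */
    printf("%d\n", masked_sub(10, 3, 1, 0,   99)); /* mask clear: 99 */
    return 0;
}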
__kernel void arithm_s_sub_with_mask_C2_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
uchar4 src1_data = vload4(0, src1 + src1_index);
int4 src2_data = (int4)(src2.x, src2.y, src2.x, src2.y);
uchar2 mask_data = vload2(0, mask + mask_index);
uchar4 data = *((__global uchar4 *)(dst + dst_index));
int4 tmp = convert_int4_sat(src1_data) - src2_data;
tmp = isMatSubScalar ? tmp : -tmp;
uchar4 tmp_data = convert_uchar4_sat(tmp);
data.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.xy : data.xy;
data.zw = ((mask_data.y) && (dst_index + 2 < dst_end )) ? tmp_data.zw : data.zw;
*((__global uchar4 *)(dst + dst_index)) = data;
}
}
__kernel void arithm_s_sub_with_mask_C2_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
uchar mask_data = *(mask + mask_index);
ushort2 src_data1 = *((__global ushort2 *)((__global char *)src1 + src1_index));
int2 src_data2 = (int2)(src2.x, src2.y);
ushort2 dst_data = *((__global ushort2 *)((__global char *)dst + dst_index));
int2 tmp = convert_int2_sat(src_data1) - src_data2;
tmp = isMatSubScalar ? tmp : -tmp;
ushort2 data = convert_ushort2_sat(tmp);
data = mask_data ? data : dst_data;
*((__global ushort2 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_sub_with_mask_C2_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
uchar mask_data = *(mask + mask_index);
short2 src_data1 = *((__global short2 *)((__global char *)src1 + src1_index));
int2 src_data2 = (int2)(src2.x, src2.y);
short2 dst_data = *((__global short2 *)((__global char *)dst + dst_index));
int2 tmp = convert_int2_sat(src_data1) - src_data2;
tmp = isMatSubScalar ? tmp : -tmp;
short2 data = convert_short2_sat(tmp);
data = mask_data ? data : dst_data;
*((__global short2 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_sub_with_mask_C2_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
uchar mask_data = *(mask + mask_index);
int2 src_data1 = *((__global int2 *)((__global char *)src1 + src1_index));
int2 src_data2 = (int2)(src2.x, src2.y);
int2 dst_data = *((__global int2 *)((__global char *)dst + dst_index));
long2 tmp = convert_long2_sat(src_data1) - convert_long2_sat(src_data2);
tmp = isMatSubScalar ? tmp : -tmp;
int2 data = convert_int2_sat(tmp);
data = mask_data ? data : dst_data;
*((__global int2 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_sub_with_mask_C2_D5 (__global float *src1, int src1_step, int src1_offset,
__global float *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
float4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
uchar mask_data = *(mask + mask_index);
float2 src_data1 = *((__global float2 *)((__global char *)src1 + src1_index));
float2 src_data2 = (float2)(src2.x, src2.y);
float2 dst_data = *((__global float2 *)((__global char *)dst + dst_index));
float2 data = src_data1 - src_data2;
data = isMatSubScalar ? data : -data;
data = mask_data ? data : dst_data;
*((__global float2 *)((__global char *)dst + dst_index)) = data;
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_s_sub_with_mask_C2_D6 (__global double *src1, int src1_step, int src1_offset,
__global double *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
double4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, (x << 4) + dst_offset);
uchar mask_data = *(mask + mask_index);
double2 src_data1 = *((__global double2 *)((__global char *)src1 + src1_index));
double2 src_data2 = (double2)(src2.x, src2.y);
double2 dst_data = *((__global double2 *)((__global char *)dst + dst_index));
double2 data = src_data1 - src_data2;
data = isMatSubScalar ? data : -data;
data = mask_data ? data : dst_data;
*((__global double2 *)((__global char *)dst + dst_index)) = data;
}
}
#endif
__kernel void arithm_s_sub_with_mask_C3_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3));
uchar4 src1_data_0 = vload4(0, src1 + src1_index + 0);
uchar4 src1_data_1 = vload4(0, src1 + src1_index + 4);
uchar4 src1_data_2 = vload4(0, src1 + src1_index + 8);
int4 src2_data_0 = (int4)(src2.x, src2.y, src2.z, src2.x);
int4 src2_data_1 = (int4)(src2.y, src2.z, src2.x, src2.y);
int4 src2_data_2 = (int4)(src2.z, src2.x, src2.y, src2.z);
uchar4 mask_data = vload4(0, mask + mask_index);
uchar4 data_0 = *((__global uchar4 *)(dst + dst_index + 0));
uchar4 data_1 = *((__global uchar4 *)(dst + dst_index + 4));
uchar4 data_2 = *((__global uchar4 *)(dst + dst_index + 8));
int4 tmp_0 = convert_int4_sat(src1_data_0) - src2_data_0;
int4 tmp_1 = convert_int4_sat(src1_data_1) - src2_data_1;
int4 tmp_2 = convert_int4_sat(src1_data_2) - src2_data_2;
tmp_0 = isMatSubScalar ? tmp_0 : -tmp_0;
tmp_1 = isMatSubScalar ? tmp_1 : -tmp_1;
tmp_2 = isMatSubScalar ? tmp_2 : -tmp_2;
uchar4 tmp_data_0 = convert_uchar4_sat(tmp_0);
uchar4 tmp_data_1 = convert_uchar4_sat(tmp_1);
uchar4 tmp_data_2 = convert_uchar4_sat(tmp_2);
data_0.xyz = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_0.w : data_0.w;
data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_1.xy : data_1.xy;
data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.zw : data_1.zw;
data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.x : data_2.x;
data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
? tmp_data_2.yzw : data_2.yzw;
*((__global uchar4 *)(dst + dst_index + 0)) = data_0;
*((__global uchar4 *)(dst + dst_index + 4)) = data_1;
*((__global uchar4 *)(dst + dst_index + 8)) = data_2;
}
}
__kernel void arithm_s_sub_with_mask_C3_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 1;
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6));
ushort2 src1_data_0 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 0));
ushort2 src1_data_1 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 4));
ushort2 src1_data_2 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 8));
int2 src2_data_0 = (int2)(src2.x, src2.y);
int2 src2_data_1 = (int2)(src2.z, src2.x);
int2 src2_data_2 = (int2)(src2.y, src2.z);
uchar2 mask_data = vload2(0, mask + mask_index);
ushort2 data_0 = *((__global ushort2 *)((__global char *)dst + dst_index + 0));
ushort2 data_1 = *((__global ushort2 *)((__global char *)dst + dst_index + 4));
ushort2 data_2 = *((__global ushort2 *)((__global char *)dst + dst_index + 8));
int2 tmp_0 = convert_int2_sat(src1_data_0) - src2_data_0;
int2 tmp_1 = convert_int2_sat(src1_data_1) - src2_data_1;
int2 tmp_2 = convert_int2_sat(src1_data_2) - src2_data_2;
tmp_0 = isMatSubScalar ? tmp_0 : -tmp_0;
tmp_1 = isMatSubScalar ? tmp_1 : -tmp_1;
tmp_2 = isMatSubScalar ? tmp_2 : -tmp_2;
ushort2 tmp_data_0 = convert_ushort2_sat(tmp_0);
ushort2 tmp_data_1 = convert_ushort2_sat(tmp_1);
ushort2 tmp_data_2 = convert_ushort2_sat(tmp_2);
data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;
*((__global ushort2 *)((__global char *)dst + dst_index + 0)) = data_0;
*((__global ushort2 *)((__global char *)dst + dst_index + 4)) = data_1;
*((__global ushort2 *)((__global char *)dst + dst_index + 8)) = data_2;
}
}
__kernel void arithm_s_sub_with_mask_C3_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 1;
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6));
short2 src1_data_0 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 0));
short2 src1_data_1 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 4));
short2 src1_data_2 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 8));
int2 src2_data_0 = (int2)(src2.x, src2.y);
int2 src2_data_1 = (int2)(src2.z, src2.x);
int2 src2_data_2 = (int2)(src2.y, src2.z);
uchar2 mask_data = vload2(0, mask + mask_index);
short2 data_0 = *((__global short2 *)((__global char *)dst + dst_index + 0));
short2 data_1 = *((__global short2 *)((__global char *)dst + dst_index + 4));
short2 data_2 = *((__global short2 *)((__global char *)dst + dst_index + 8));
int2 tmp_0 = convert_int2_sat(src1_data_0) - src2_data_0;
int2 tmp_1 = convert_int2_sat(src1_data_1) - src2_data_1;
int2 tmp_2 = convert_int2_sat(src1_data_2) - src2_data_2;
tmp_0 = isMatSubScalar ? tmp_0 : -tmp_0;
tmp_1 = isMatSubScalar ? tmp_1 : -tmp_1;
tmp_2 = isMatSubScalar ? tmp_2 : -tmp_2;
short2 tmp_data_0 = convert_short2_sat(tmp_0);
short2 tmp_data_1 = convert_short2_sat(tmp_1);
short2 tmp_data_2 = convert_short2_sat(tmp_2);
data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;
*((__global short2 *)((__global char *)dst + dst_index + 0)) = data_0;
*((__global short2 *)((__global char *)dst + dst_index + 4)) = data_1;
*((__global short2 *)((__global char *)dst + dst_index + 8)) = data_2;
}
}
__kernel void arithm_s_sub_with_mask_C3_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
int src1_data_0 = *((__global int *)((__global char *)src1 + src1_index + 0));
int src1_data_1 = *((__global int *)((__global char *)src1 + src1_index + 4));
int src1_data_2 = *((__global int *)((__global char *)src1 + src1_index + 8));
int src2_data_0 = src2.x;
int src2_data_1 = src2.y;
int src2_data_2 = src2.z;
uchar mask_data = *(mask + mask_index);
int data_0 = *((__global int *)((__global char *)dst + dst_index + 0));
int data_1 = *((__global int *)((__global char *)dst + dst_index + 4));
int data_2 = *((__global int *)((__global char *)dst + dst_index + 8));
long tmp_0 = (long)src1_data_0 - (long)src2_data_0;
long tmp_1 = (long)src1_data_1 - (long)src2_data_1;
long tmp_2 = (long)src1_data_2 - (long)src2_data_2;
tmp_0 = isMatSubScalar ? tmp_0 : -tmp_0;
tmp_1 = isMatSubScalar ? tmp_1 : -tmp_1;
tmp_2 = isMatSubScalar ? tmp_2 : -tmp_2;
int tmp_data_0 = convert_int_sat(tmp_0);
int tmp_data_1 = convert_int_sat(tmp_1);
int tmp_data_2 = convert_int_sat(tmp_2);
data_0 = mask_data ? tmp_data_0 : data_0;
data_1 = mask_data ? tmp_data_1 : data_1;
data_2 = mask_data ? tmp_data_2 : data_2;
*((__global int *)((__global char *)dst + dst_index + 0)) = data_0;
*((__global int *)((__global char *)dst + dst_index + 4)) = data_1;
*((__global int *)((__global char *)dst + dst_index + 8)) = data_2;
}
}
__kernel void arithm_s_sub_with_mask_C3_D5 (__global float *src1, int src1_step, int src1_offset,
__global float *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
float4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
float src1_data_0 = *((__global float *)((__global char *)src1 + src1_index + 0));
float src1_data_1 = *((__global float *)((__global char *)src1 + src1_index + 4));
float src1_data_2 = *((__global float *)((__global char *)src1 + src1_index + 8));
float src2_data_0 = src2.x;
float src2_data_1 = src2.y;
float src2_data_2 = src2.z;
uchar mask_data = *(mask + mask_index);
float data_0 = *((__global float *)((__global char *)dst + dst_index + 0));
float data_1 = *((__global float *)((__global char *)dst + dst_index + 4));
float data_2 = *((__global float *)((__global char *)dst + dst_index + 8));
float tmp_data_0 = src1_data_0 - src2_data_0;
float tmp_data_1 = src1_data_1 - src2_data_1;
float tmp_data_2 = src1_data_2 - src2_data_2;
tmp_data_0 = isMatSubScalar ? tmp_data_0 : -tmp_data_0;
tmp_data_1 = isMatSubScalar ? tmp_data_1 : -tmp_data_1;
tmp_data_2 = isMatSubScalar ? tmp_data_2 : -tmp_data_2;
data_0 = mask_data ? tmp_data_0 : data_0;
data_1 = mask_data ? tmp_data_1 : data_1;
data_2 = mask_data ? tmp_data_2 : data_2;
*((__global float *)((__global char *)dst + dst_index + 0)) = data_0;
*((__global float *)((__global char *)dst + dst_index + 4)) = data_1;
*((__global float *)((__global char *)dst + dst_index + 8)) = data_2;
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_s_sub_with_mask_C3_D6 (__global double *src1, int src1_step, int src1_offset,
__global double *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
double4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 24));
double src1_data_0 = *((__global double *)((__global char *)src1 + src1_index + 0 ));
double src1_data_1 = *((__global double *)((__global char *)src1 + src1_index + 8 ));
double src1_data_2 = *((__global double *)((__global char *)src1 + src1_index + 16));
double src2_data_0 = src2.x;
double src2_data_1 = src2.y;
double src2_data_2 = src2.z;
uchar mask_data = *(mask + mask_index);
double data_0 = *((__global double *)((__global char *)dst + dst_index + 0 ));
double data_1 = *((__global double *)((__global char *)dst + dst_index + 8 ));
double data_2 = *((__global double *)((__global char *)dst + dst_index + 16));
double tmp_data_0 = src1_data_0 - src2_data_0;
double tmp_data_1 = src1_data_1 - src2_data_1;
double tmp_data_2 = src1_data_2 - src2_data_2;
tmp_data_0 = isMatSubScalar ? tmp_data_0 : -tmp_data_0;
tmp_data_1 = isMatSubScalar ? tmp_data_1 : -tmp_data_1;
tmp_data_2 = isMatSubScalar ? tmp_data_2 : -tmp_data_2;
data_0 = mask_data ? tmp_data_0 : data_0;
data_1 = mask_data ? tmp_data_1 : data_1;
data_2 = mask_data ? tmp_data_2 : data_2;
*((__global double *)((__global char *)dst + dst_index + 0 )) = data_0;
*((__global double *)((__global char *)dst + dst_index + 8 )) = data_1;
*((__global double *)((__global char *)dst + dst_index + 16)) = data_2;
}
}
#endif
__kernel void arithm_s_sub_with_mask_C4_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
uchar mask_data = *(mask + mask_index);
uchar4 src_data1 = *((__global uchar4 *)(src1 + src1_index));
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
int4 tmp = convert_int4_sat(src_data1) - src2;
tmp = isMatSubScalar ? tmp : -tmp;
uchar4 data = convert_uchar4_sat(tmp);
data = mask_data ? data : dst_data;
*((__global uchar4 *)(dst + dst_index)) = data;
}
}
__kernel void arithm_s_sub_with_mask_C4_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
uchar mask_data = *(mask + mask_index);
ushort4 src_data1 = *((__global ushort4 *)((__global char *)src1 + src1_index));
ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index));
int4 tmp = convert_int4_sat(src_data1) - src2;
tmp = isMatSubScalar ? tmp : -tmp;
ushort4 data = convert_ushort4_sat(tmp);
data = mask_data ? data : dst_data;
*((__global ushort4 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_sub_with_mask_C4_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
uchar mask_data = *(mask + mask_index);
short4 src_data1 = *((__global short4 *)((__global char *)src1 + src1_index));
short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
int4 tmp = convert_int4_sat(src_data1) - src2;
tmp = isMatSubScalar ? tmp : -tmp;
short4 data = convert_short4_sat(tmp);
data = mask_data ? data : dst_data;
*((__global short4 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_sub_with_mask_C4_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, (x << 4) + dst_offset);
uchar mask_data = *(mask + mask_index);
int4 src_data1 = *((__global int4 *)((__global char *)src1 + src1_index));
int4 dst_data = *((__global int4 *)((__global char *)dst + dst_index));
long4 tmp = convert_long4_sat(src_data1) - convert_long4_sat(src2);
tmp = isMatSubScalar ? tmp : -tmp;
int4 data = convert_int4_sat(tmp);
data = mask_data ? data : dst_data;
*((__global int4 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_sub_with_mask_C4_D5 (__global float *src1, int src1_step, int src1_offset,
__global float *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
float4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, (x << 4) + dst_offset);
uchar mask_data = *(mask + mask_index);
float4 src_data1 = *((__global float4 *)((__global char *)src1 + src1_index));
float4 dst_data = *((__global float4 *)((__global char *)dst + dst_index));
float4 data = src_data1 - src2;
data = isMatSubScalar ? data : -data;
data = mask_data ? data : dst_data;
*((__global float4 *)((__global char *)dst + dst_index)) = data;
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_s_sub_with_mask_C4_D6 (__global double *src1, int src1_step, int src1_offset,
__global double *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
double4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 5) + src1_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, (x << 5) + dst_offset);
uchar mask_data = *(mask + mask_index);
double4 src_data1 = *((__global double4 *)((__global char *)src1 + src1_index));
double4 dst_data = *((__global double4 *)((__global char *)dst + dst_index));
double4 data = src_data1 - src2;
data = isMatSubScalar ? data : -data;
data = mask_data ? data : dst_data;
*((__global double4 *)((__global char *)dst + dst_index)) = data;
}
}
#endif
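// All of the kernels above share one per-pixel contract: src2 carries the scalar,
// isMatSubScalar selects mat - scalar (1) or scalar - mat (0, done by negating the
// difference), the result saturates to the destination depth, and the mask byte
// decides whether it replaces the old destination value. A minimal C reference of
// that rule for a single uchar channel (helper names are illustrative):
static unsigned char sat_uchar(int v)
{
    return (unsigned char)(v < 0 ? 0 : v > 255 ? 255 : v);
}
static unsigned char sub_with_mask_ref(unsigned char src, int scalar,
                                       unsigned char dst_old,
                                       unsigned char mask, int isMatSubScalar)
{
    int tmp = src - scalar;                 /* mat - scalar              */
    tmp = isMatSubScalar ? tmp : -tmp;      /* or scalar - mat           */
    return mask ? sat_uchar(tmp) : dst_old; /* unmasked pixels unchanged */
}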

View File

@ -0,0 +1,206 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Shengen Yan, yanshengen@gmail.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
/**************************************PUBLICFUNC*************************************/
#if defined (DOUBLE_SUPPORT)
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#define RES_TYPE double8
#define CONVERT_RES_TYPE convert_double8
#else
#define RES_TYPE float8
#define CONVERT_RES_TYPE convert_float8
#endif
#if defined (DEPTH_0)
#define VEC_TYPE uchar8
#endif
#if defined (DEPTH_1)
#define VEC_TYPE char8
#endif
#if defined (DEPTH_2)
#define VEC_TYPE ushort8
#endif
#if defined (DEPTH_3)
#define VEC_TYPE short8
#endif
#if defined (DEPTH_4)
#define VEC_TYPE int8
#endif
#if defined (DEPTH_5)
#define VEC_TYPE float8
#endif
#if defined (DEPTH_6)
#define VEC_TYPE double8
#endif
#if defined (FUNC_TYPE_0)
#define FUNC(a,b) b += a;
#endif
#if defined (FUNC_TYPE_1)
#define FUNC(a,b) b = b + (a >= 0 ? a : -a);
#endif
#if defined (FUNC_TYPE_2)
#define FUNC(a,b) b = b + a * a;
#endif
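// FUNC is chosen at build time, so this one source yields three reductions:
// FUNC_TYPE_0 accumulates a plain sum, FUNC_TYPE_1 a sum of absolute values, and
// FUNC_TYPE_2 a sum of squares. A hedged sketch of host-side build options
// (only the macro names come from this file; the exact strings are an assumption):
const char *opts_sum    = "-D FUNC_TYPE_0 -D DEPTH_5 -D REPEAT_S0 -D REPEAT_E0";
const char *opts_abssum = "-D FUNC_TYPE_1 -D DEPTH_5 -D REPEAT_S0 -D REPEAT_E0";
const char *opts_sqrsum = "-D FUNC_TYPE_2 -D DEPTH_5 -D REPEAT_S0 -D REPEAT_E0";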
#if defined (REPEAT_S0)
#define repeat_s(a) a = a;
#endif
#if defined (REPEAT_S1)
#define repeat_s(a) a.s0 = 0;
#endif
#if defined (REPEAT_S2)
#define repeat_s(a) a.s0 = 0;a.s1 = 0;
#endif
#if defined (REPEAT_S3)
#define repeat_s(a) a.s0 = 0;a.s1 = 0;a.s2 = 0;
#endif
#if defined (REPEAT_S4)
#define repeat_s(a) a.s0 = 0;a.s1 = 0;a.s2 = 0;a.s3 = 0;
#endif
#if defined (REPEAT_S5)
#define repeat_s(a) a.s0 = 0;a.s1 = 0;a.s2 = 0;a.s3 = 0;a.s4 = 0;
#endif
#if defined (REPEAT_S6)
#define repeat_s(a) a.s0 = 0;a.s1 = 0;a.s2 = 0;a.s3 = 0;a.s4 = 0;a.s5 = 0;
#endif
#if defined (REPEAT_S7)
#define repeat_s(a) a.s0 = 0;a.s1 = 0;a.s2 = 0;a.s3 = 0;a.s4 = 0;a.s5 = 0;a.s6 = 0;
#endif
#if defined (REPEAT_E0)
#define repeat_e(a) a = a;
#endif
#if defined (REPEAT_E1)
#define repeat_e(a) a.s7 = 0;
#endif
#if defined (REPEAT_E2)
#define repeat_e(a) a.s7 = 0;a.s6 = 0;
#endif
#if defined (REPEAT_E3)
#define repeat_e(a) a.s7 = 0;a.s6 = 0;a.s5 = 0;
#endif
#if defined (REPEAT_E4)
#define repeat_e(a) a.s7 = 0;a.s6 = 0;a.s5 = 0;a.s4 = 0;
#endif
#if defined (REPEAT_E5)
#define repeat_e(a) a.s7 = 0;a.s6 = 0;a.s5 = 0;a.s4 = 0;a.s3 = 0;
#endif
#if defined (REPEAT_E6)
#define repeat_e(a) a.s7 = 0;a.s6 = 0;a.s5 = 0;a.s4 = 0;a.s3 = 0;a.s2 = 0;
#endif
#if defined (REPEAT_E7)
#define repeat_e(a) a.s7 = 0;a.s6 = 0;a.s5 = 0;a.s4 = 0;a.s3 = 0;a.s2 = 0;a.s1 = 0;
#endif
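// repeat_s / repeat_e zero the lanes of a row's first and last 8-wide vector that
// fall outside the region of interest, so padding elements never reach the sum.
// For example, built with -D REPEAT_S3 -D REPEAT_E2, a first vector
// (1,2,3,4,5,6,7,8) becomes (0,0,0,4,5,6,7,8) and a last vector loses its top lanes:
//
//   repeat_s(t):  t.s0 = 0; t.s1 = 0; t.s2 = 0;   // first 3 lanes are padding
//   repeat_e(t):  t.s7 = 0; t.s6 = 0;             // last 2 lanes are padding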
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics:enable
#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics:enable
/**************************************Array buffer SUM**************************************/
__kernel void arithm_op_sum (int cols,int invalid_cols,int offset,int elemnum,int groupnum,
__global VEC_TYPE *src, __global RES_TYPE *dst)
{
unsigned int lid = get_local_id(0);
unsigned int gid = get_group_id(0);
unsigned int id = get_global_id(0);
unsigned int idx = offset + id + (id / cols) * invalid_cols;
__local RES_TYPE localmem_sum[128];
RES_TYPE sum = 0,temp;
if(id < elemnum)
{
temp = CONVERT_RES_TYPE(src[idx]);
if(id % cols == 0 )
{
repeat_s(temp);
}
if(id % cols == cols - 1)
{
repeat_e(temp);
}
FUNC(temp,sum);
}
else
{
sum = 0;
}
for(id = id + (groupnum << 8); id < elemnum; id = id + (groupnum << 8))
{
idx = offset + id + (id / cols) * invalid_cols;
temp = CONVERT_RES_TYPE(src[idx]);
if(id % cols == 0 )
{
repeat_s(temp);
}
if(id % cols == cols - 1)
{
repeat_e(temp);
}
FUNC(temp,sum);
}
if(lid > 127)
{
localmem_sum[lid - 128] = sum;
}
barrier(CLK_LOCAL_MEM_FENCE);
if(lid < 128)
{
localmem_sum[lid] = sum + localmem_sum[lid];
}
barrier(CLK_LOCAL_MEM_FENCE);
for(int lsize = 64; lsize > 0; lsize >>= 1)
{
if(lid < lsize)
{
int lid2 = lsize + lid;
localmem_sum[lid] = localmem_sum[lid] + localmem_sum[lid2];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
if(lid == 0)
{
dst[gid] = localmem_sum[0];
}
}
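// arithm_op_sum is the first stage of a two-stage reduction: each group (evidently
// 256 work-items, given the groupnum << 8 stride) strides over the buffer, folds
// threads 128..255 onto 0..127 through local memory, tree-reduces the remaining
// 128 slots, and leaves one RES_TYPE partial per group in dst[gid]. A minimal
// host-side finish, assuming dst was read back as an array (names illustrative):
static double finish_sum(const double *partials, int groupnum)
{
    double total = 0.0;
    for (int g = 0; g < groupnum; ++g)        /* one partial vector per group */
        for (int lane = 0; lane < 8; ++lane)  /* RES_TYPE is 8 lanes wide     */
            total += partials[g * 8 + lane];
    return total;
}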

View File

@ -0,0 +1,248 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Shengen Yan, yanshengen@gmail.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
/**************************************PUBLICFUNC*************************************/
#if defined (DOUBLE_SUPPORT)
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#define RES_TYPE double4
#define CONVERT_RES_TYPE convert_double4
#else
#define RES_TYPE float4
#define CONVERT_RES_TYPE convert_float4
#endif
#if defined (DEPTH_0)
#define VEC_TYPE uchar4
#endif
#if defined (DEPTH_1)
#define VEC_TYPE char4
#endif
#if defined (DEPTH_2)
#define VEC_TYPE ushort4
#endif
#if defined (DEPTH_3)
#define VEC_TYPE short4
#endif
#if defined (DEPTH_4)
#define VEC_TYPE int4
#endif
#if defined (DEPTH_5)
#define VEC_TYPE float4
#endif
#if defined (DEPTH_6)
#define VEC_TYPE double4
#endif
#if defined (FUNC_TYPE_0)
#define FUNC(a,b) b += a;
#endif
#if defined (FUNC_TYPE_1)
#define FUNC(a,b) b = b + (a >= 0 ? a : -a);
#endif
#if defined (FUNC_TYPE_2)
#define FUNC(a,b) b = b + a * a;
#endif
#if defined (REPEAT_S0)
#define repeat_s(a,b,c) a=a; b=b; c=c;
#endif
#if defined (REPEAT_S1)
#define repeat_s(a,b,c) a.s0=0; b=b; c=c;
#endif
#if defined (REPEAT_S2)
#define repeat_s(a,b,c) a.s0=0; a.s1=0; b=b; c=c;
#endif
#if defined (REPEAT_S3)
#define repeat_s(a,b,c) a.s0=0; a.s1=0; a.s2=0; b=b; c=c;
#endif
#if defined (REPEAT_S4)
#define repeat_s(a,b,c) a=0;b=b; c=c;
#endif
#if defined (REPEAT_S5)
#define repeat_s(a,b,c) a=0; b.s0=0;c=c;
#endif
#if defined (REPEAT_S6)
#define repeat_s(a,b,c) a=0; b.s0=0; b.s1=0; c=c;
#endif
#if defined (REPEAT_S7)
#define repeat_s(a,b,c) a=0; b.s0=0; b.s1=0; b.s2=0; c=c;
#endif
#if defined (REPEAT_S8)
#define repeat_s(a,b,c) a=0; b=0; c=c;
#endif
#if defined (REPEAT_S9)
#define repeat_s(a,b,c) a=0; b=0; c.s0=0;
#endif
#if defined (REPEAT_S10)
#define repeat_s(a,b,c) a=0; b=0; c.s0=0; c.s1=0;
#endif
#if defined (REPEAT_S11)
#define repeat_s(a,b,c) a=0; b=0; c.s0=0; c.s1=0; c.s2=0;
#endif
#if defined (REPEAT_E0)
#define repeat_e(a,b,c) a=a; b=b; c=c;
#endif
#if defined (REPEAT_E1)
#define repeat_e(a,b,c) a=a; b=b; c.s3=0;
#endif
#if defined (REPEAT_E2)
#define repeat_e(a,b,c) a=a; b=b; c.s3=0; c.s2=0;
#endif
#if defined (REPEAT_E3)
#define repeat_e(a,b,c) a=a; b=b; c.s3=0; c.s2=0; c.s1=0;
#endif
#if defined (REPEAT_E4)
#define repeat_e(a,b,c) a=a; b=b; c=0;
#endif
#if defined (REPEAT_E5)
#define repeat_e(a,b,c) a=a; b.s3=0; c=0;
#endif
#if defined (REPEAT_E6)
#define repeat_e(a,b,c) a=a; b.s3=0; b.s2=0; c=0;
#endif
#if defined (REPEAT_E7)
#define repeat_e(a,b,c) a=a; b.s3=0; b.s2=0; b.s1=0; c=0;
#endif
#if defined (REPEAT_E8)
#define repeat_e(a,b,c) a=a; b=0; c=0;
#endif
#if defined (REPEAT_E9)
#define repeat_e(a,b,c) a.s3=0; b=0; c=0;
#endif
#if defined (REPEAT_E10)
#define repeat_e(a,b,c) a.s3=0; a.s2=0; b=0; c=0;
#endif
#if defined (REPEAT_E11)
#define repeat_e(a,b,c) a.s3=0; a.s2=0; a.s1=0; b=0; c=0;
#endif
__kernel void arithm_op_sum_3 (int cols,int invalid_cols,int offset,int elemnum,int groupnum,
__global VEC_TYPE *src, __global RES_TYPE *dst)
{
unsigned int lid = get_local_id(0);
unsigned int gid = get_group_id(0);
unsigned int id = get_global_id(0);
unsigned int idx = offset + id + (id / cols) * invalid_cols;
idx = idx * 3;
__local RES_TYPE localmem_sum1[128];
__local RES_TYPE localmem_sum2[128];
__local RES_TYPE localmem_sum3[128];
RES_TYPE sum1 = 0,sum2 = 0,sum3 = 0,temp1,temp2,temp3;
if(id < elemnum)
{
temp1 = CONVERT_RES_TYPE(src[idx]);
temp2 = CONVERT_RES_TYPE(src[idx+1]);
temp3 = CONVERT_RES_TYPE(src[idx+2]);
if(id % cols == 0 )
{
repeat_s(temp1,temp2,temp3);
}
if(id % cols == cols - 1)
{
repeat_e(temp1,temp2,temp3);
}
FUNC(temp1,sum1);
FUNC(temp2,sum2);
FUNC(temp3,sum3);
}
else
{
sum1 = 0;
sum2 = 0;
sum3 = 0;
}
for(id = id + (groupnum << 8); id < elemnum; id = id + (groupnum << 8))
{
idx = offset + id + (id / cols) * invalid_cols;
idx = idx * 3;
temp1 = CONVERT_RES_TYPE(src[idx]);
temp2 = CONVERT_RES_TYPE(src[idx+1]);
temp3 = CONVERT_RES_TYPE(src[idx+2]);
if(id % cols == 0 )
{
repeat_s(temp1,temp2,temp3);
}
if(id % cols == cols - 1)
{
repeat_e(temp1,temp2,temp3);
}
FUNC(temp1,sum1);
FUNC(temp2,sum2);
FUNC(temp3,sum3);
}
if(lid > 127)
{
localmem_sum1[lid - 128] = sum1;
localmem_sum2[lid - 128] = sum2;
localmem_sum3[lid - 128] = sum3;
}
barrier(CLK_LOCAL_MEM_FENCE);
if(lid < 128)
{
localmem_sum1[lid] = sum1 + localmem_sum1[lid];
localmem_sum2[lid] = sum2 + localmem_sum2[lid];
localmem_sum3[lid] = sum3 + localmem_sum3[lid];
}
barrier(CLK_LOCAL_MEM_FENCE);
for(int lsize = 64; lsize > 0; lsize >>= 1)
{
if(lid < lsize)
{
int lid2 = lsize + lid;
localmem_sum1[lid] = localmem_sum1[lid] + localmem_sum1[lid2];
localmem_sum2[lid] = localmem_sum2[lid] + localmem_sum2[lid2];
localmem_sum3[lid] = localmem_sum3[lid] + localmem_sum3[lid2];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
if(lid == 0)
{
dst[gid*3] = localmem_sum1[0];
dst[gid*3+1] = localmem_sum2[0];
dst[gid*3+2] = localmem_sum3[0];
}
}
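// The 3-channel variant keeps three running sums because each pixel index maps to
// three consecutive VEC_TYPE vectors (idx*3, idx*3+1, idx*3+2) and the partials
// land interleaved in dst[gid*3 + c]. The matching host-side fold, under the same
// read-back assumption as the single-channel sketch (RES_TYPE is 4 lanes here):
static void finish_sum3(const double *partials, int groupnum, double channel_sum[3])
{
    for (int c = 0; c < 3; ++c)
        channel_sum[c] = 0.0;
    for (int g = 0; g < groupnum; ++g)
        for (int c = 0; c < 3; ++c)
            for (int lane = 0; lane < 4; ++lane)
                channel_sum[c] += partials[(g * 3 + c) * 4 + lane];
}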

View File

@ -0,0 +1,510 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Jia Haipeng, jiahaipeng95@gmail.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#define TILE_DIM 32
#define BLOCK_ROWS 8
#define LDS_STEP (TILE_DIM + 1)
//8UC1 is not optimized, as the size of each write per thread is 8 bits,
//which will use the CompletePath
__kernel void transpose_C1_D0(__global uchar* src, int src_step, int src_offset,
__global uchar* dst, int dst_step, int dst_offset,
int src_rows, int src_cols)
{
int gp_x = get_group_id(0), gp_y = get_group_id(1);
int gs_x = get_num_groups(0), gs_y = get_num_groups(1);
int groupId_x, groupId_y;
if(src_rows == src_cols)
{
groupId_y = gp_x;
groupId_x = (gp_x + gp_y) % gs_x;
}
else
{
int bid = gp_x + gs_x * gp_y;
groupId_y = bid % gs_y;
groupId_x = ((bid / gs_y) + groupId_y) % gs_x;
}
int lx = get_local_id(0);
int ly = get_local_id(1);
int x = groupId_x * TILE_DIM + lx;
int y = groupId_y * TILE_DIM + ly;
int x_index = groupId_y * TILE_DIM + lx;
int y_index = groupId_x * TILE_DIM + ly;
__local uchar title[TILE_DIM * LDS_STEP];
if(x < src_cols && y < src_rows)
{
int index_src = mad24(y, src_step, x);
#pragma unroll
for(int i = 0; i < TILE_DIM; i += BLOCK_ROWS)
{
if(y + i < src_rows)
{
title[(ly + i) * LDS_STEP + lx] = *(src + src_offset + index_src);
index_src = mad24(BLOCK_ROWS, src_step, index_src);
}
}
}
barrier(CLK_LOCAL_MEM_FENCE);
if(x_index < src_rows && y_index < src_cols)
{
int index_dst = mad24(y_index, dst_step, x_index);
#pragma unroll
for(int i = 0; i < TILE_DIM; i += BLOCK_ROWS)
{
if((y_index + i) < src_cols)
{
*(dst + dst_offset + index_dst ) = title[lx * LDS_STEP + ly + i];
index_dst += dst_step * BLOCK_ROWS ;
}
}
}
}
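// Two classic transpose tricks are visible above: work-groups are renumbered
// diagonally (the groupId_x/groupId_y swizzle) so concurrent groups spread their
// reads and writes across memory channels, and the local tile is padded by one
// column (LDS_STEP = TILE_DIM + 1) so a column walk lands in a different bank on
// every step. The padding arithmetic, assuming 32 one-element-wide local-memory
// banks (a common layout, not something this source states):
//
//   element (row, col) lives at row * LDS_STEP + col
//   without padding: bank = (row * 32 + col) % 32 = col            -> 32-way conflict
//   with padding:    bank = (row * 33 + col) % 32 = (row + col)%32 -> 32 distinct banks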
__kernel void transpose_C1_D4(__global int* src, int src_step, int src_offset,
__global int* dst, int dst_step, int dst_offset,
int src_rows, int src_cols)
{
int gp_x = get_group_id(0), gp_y = get_group_id(1);
int gs_x = get_num_groups(0), gs_y = get_num_groups(1);
int groupId_x, groupId_y;
if(src_rows == src_cols)
{
groupId_y = gp_x;
groupId_x = (gp_x + gp_y) % gs_x;
}
else
{
int bid = gp_x + gs_x * gp_y;
groupId_y = bid % gs_y;
groupId_x = ((bid / gs_y) + groupId_y) % gs_x;
}
int lx = get_local_id(0);
int ly = get_local_id(1);
int x = groupId_x * TILE_DIM + lx;
int y = groupId_y * TILE_DIM + ly;
int x_index = groupId_y * TILE_DIM + lx;
int y_index = groupId_x * TILE_DIM + ly;
__local int title[TILE_DIM * LDS_STEP];
if(x < src_cols && y < src_rows)
{
int index_src = mad24(y, src_step, (x << 2));
#pragma unroll
for(int i = 0; i < TILE_DIM; i += BLOCK_ROWS)
{
if(y + i < src_rows)
{
title[(ly + i) * LDS_STEP + lx] = *((__global int *)((__global char*)src + src_offset + index_src));
index_src = mad24(BLOCK_ROWS, src_step, index_src);
}
}
}
barrier(CLK_LOCAL_MEM_FENCE);
if(x_index < src_rows && y_index < src_cols)
{
int index_dst = mad24(y_index, dst_step, (x_index << 2));
#pragma unroll
for(int i = 0; i < TILE_DIM; i += BLOCK_ROWS)
{
if((y_index + i) < src_cols)
{
*((__global int*)((__global char*)dst + dst_offset + index_dst )) = title[lx * LDS_STEP + ly + i];
index_dst += dst_step * BLOCK_ROWS ;
}
}
}
}
__kernel void transpose_C1_D5(__global float* src, int src_step, int src_offset,
__global float* dst, int dst_step, int dst_offset,
int src_rows, int src_cols)
{
int gp_x = get_group_id(0), gp_y = get_group_id(1);
int gs_x = get_num_groups(0), gs_y = get_num_groups(1);
int groupId_x, groupId_y;
if(src_rows == src_cols)
{
groupId_y = gp_x;
groupId_x = (gp_x + gp_y) % gs_x;
}
else
{
int bid = gp_x + gs_x * gp_y;
groupId_y = bid % gs_y;
groupId_x = ((bid / gs_y) + groupId_y) % gs_x;
}
int lx = get_local_id(0);
int ly = get_local_id(1);
int x = groupId_x * TILE_DIM + lx;
int y = groupId_y * TILE_DIM + ly;
int x_index = groupId_y * TILE_DIM + lx;
int y_index = groupId_x * TILE_DIM + ly;
__local float title[TILE_DIM * LDS_STEP];
if(x < src_cols && y < src_rows)
{
int index_src = mad24(y, src_step, (x << 2));
#pragma unroll
for(int i = 0; i < TILE_DIM; i += BLOCK_ROWS)
{
if(y + i < src_rows)
{
title[(ly + i) * LDS_STEP + lx] = *((__global float *)((__global char*)src + src_offset + index_src));
index_src = mad24(BLOCK_ROWS, src_step, index_src);
}
}
}
barrier(CLK_LOCAL_MEM_FENCE);
if(x_index < src_rows && y_index < src_cols)
{
int index_dst = mad24(y_index, dst_step, (x_index << 2));
#pragma unroll
for(int i = 0; i < TILE_DIM; i += BLOCK_ROWS)
{
if((y_index + i) < src_cols)
{
*((__global float*)((__global char*)dst + dst_offset + index_dst )) = title[lx * LDS_STEP + ly + i];
index_dst += dst_step * BLOCK_ROWS ;
}
}
}
}
__kernel void transpose_C2_D2(__global ushort* src, int src_step, int src_offset,
__global ushort* dst, int dst_step, int dst_offset,
int src_rows, int src_cols)
{
int gp_x = get_group_id(0), gp_y = get_group_id(1);
int gs_x = get_num_groups(0), gs_y = get_num_groups(1);
int groupId_x, groupId_y;
if(src_rows == src_cols)
{
groupId_y = gp_x;
groupId_x = (gp_x + gp_y) % gs_x;
}
else
{
int bid = gp_x + gs_x * gp_y;
groupId_y = bid % gs_y;
groupId_x = ((bid / gs_y) + groupId_y) % gs_x;
}
int lx = get_local_id(0);
int ly = get_local_id(1);
int x = groupId_x * TILE_DIM + lx;
int y = groupId_y * TILE_DIM + ly;
int x_index = groupId_y * TILE_DIM + lx;
int y_index = groupId_x * TILE_DIM + ly;
__local ushort2 title[TILE_DIM * LDS_STEP];
if(x < src_cols && y < src_rows)
{
int index_src = mad24(y, src_step, (x << 2));
#pragma unroll
for(int i = 0; i < TILE_DIM; i += BLOCK_ROWS)
{
if(y + i < src_rows)
{
title[(ly + i) * LDS_STEP + lx] = *((__global ushort2 *)((__global char*)src + src_offset + index_src));
index_src = mad24(BLOCK_ROWS, src_step, index_src);
}
}
}
barrier(CLK_LOCAL_MEM_FENCE);
if(x_index < src_rows && y_index < src_cols)
{
int index_dst = mad24(y_index, dst_step, (x_index << 2));
#pragma unroll
for(int i = 0; i < TILE_DIM; i += BLOCK_ROWS)
{
if((y_index + i) < src_cols)
{
*((__global ushort2*)((__global char*)dst + dst_offset + index_dst )) = title[lx * LDS_STEP + ly + i];
index_dst += dst_step * BLOCK_ROWS ;
}
}
}
}
__kernel void transpose_C2_D3(__global short* src, int src_step, int src_offset,
__global short* dst, int dst_step, int dst_offset,
int src_rows, int src_cols)
{
int gp_x = get_group_id(0), gp_y = get_group_id(1);
int gs_x = get_num_groups(0), gs_y = get_num_groups(1);
int groupId_x, groupId_y;
if(src_rows == src_cols)
{
groupId_y = gp_x;
groupId_x = (gp_x + gp_y) % gs_x;
}
else
{
int bid = gp_x + gs_x * gp_y;
groupId_y = bid % gs_y;
groupId_x = ((bid / gs_y) + groupId_y) % gs_x;
}
int lx = get_local_id(0);
int ly = get_local_id(1);
int x = groupId_x * TILE_DIM + lx;
int y = groupId_y * TILE_DIM + ly;
int x_index = groupId_y * TILE_DIM + lx;
int y_index = groupId_x * TILE_DIM + ly;
__local short2 title[TILE_DIM * LDS_STEP];
if(x < src_cols && y < src_rows)
{
int index_src = mad24(y, src_step, (x << 2));
#pragma unroll
for(int i = 0; i < TILE_DIM; i += BLOCK_ROWS)
{
if(y + i < src_rows)
{
title[(ly + i) * LDS_STEP + lx] = *((__global short2 *)((__global char*)src + src_offset + index_src));
index_src = mad24(BLOCK_ROWS, src_step, index_src);
}
}
}
barrier(CLK_LOCAL_MEM_FENCE);
if(x_index < src_rows && y_index < src_cols)
{
int index_dst = mad24(y_index, dst_step, (x_index << 2));
#pragma unroll
for(int i = 0; i < TILE_DIM; i += BLOCK_ROWS)
{
if((y_index + i) < src_cols)
{
*((__global short2*)((__global char*)dst + dst_offset + index_dst )) = title[lx * LDS_STEP + ly + i];
index_dst += dst_step * BLOCK_ROWS ;
}
}
}
}
__kernel void transpose_C4_D0(__global uchar* src, int src_step, int src_offset,
__global uchar* dst, int dst_step, int dst_offset,
int src_rows, int src_cols)
{
int gp_x = get_group_id(0), gp_y = get_group_id(1);
int gs_x = get_num_groups(0), gs_y = get_num_groups(1);
int groupId_x, groupId_y;
if(src_rows == src_cols)
{
groupId_y = gp_x;
groupId_x = (gp_x + gp_y) % gs_x;
}
else
{
int bid = gp_x + gs_x * gp_y;
groupId_y = bid % gs_y;
groupId_x = ((bid / gs_y) + groupId_y) % gs_x;
}
int lx = get_local_id(0);
int ly = get_local_id(1);
int x = groupId_x * TILE_DIM + lx;
int y = groupId_y * TILE_DIM + ly;
int x_index = groupId_y * TILE_DIM + lx;
int y_index = groupId_x * TILE_DIM + ly;
__local uchar4 title[TILE_DIM * LDS_STEP];
if(x < src_cols && y < src_rows)
{
int index_src = mad24(y, src_step, (x << 2));
#pragma unroll
for(int i = 0; i < TILE_DIM; i += BLOCK_ROWS)
{
if(y + i < src_rows)
{
title[(ly + i) * LDS_STEP + lx] = *((__global uchar4 *)(src + src_offset + index_src));
index_src = mad24(BLOCK_ROWS, src_step, index_src);
}
}
}
barrier(CLK_LOCAL_MEM_FENCE);
if(x_index < src_rows && y_index < src_cols)
{
int index_dst = mad24(y_index, dst_step, (x_index << 2));
#pragma unroll
for(int i = 0; i < TILE_DIM; i += BLOCK_ROWS)
{
if((y_index + i) < src_cols)
{
*((__global uchar4*)(dst + dst_offset + index_dst )) = title[lx * LDS_STEP + ly + i];
index_dst += dst_step * BLOCK_ROWS ;
}
}
}
}
__kernel void transpose_C4_D1(__global char* src, int src_step, int src_offset,
__global char* dst, int dst_step, int dst_offset,
int src_rows, int src_cols)
{
int gp_x = get_group_id(0), gp_y = get_group_id(1);
int gs_x = get_num_groups(0), gs_y = get_num_groups(1);
int groupId_x, groupId_y;
if(src_rows == src_cols)
{
groupId_y = gp_x;
groupId_x = (gp_x + gp_y) % gs_x;
}
else
{
int bid = gp_x + gs_x * gp_y;
groupId_y = bid % gs_y;
groupId_x = ((bid / gs_y) + groupId_y) % gs_x;
}
int lx = get_local_id(0);
int ly = get_local_id(1);
int x = groupId_x * TILE_DIM + lx;
int y = groupId_y * TILE_DIM + ly;
int x_index = groupId_y * TILE_DIM + lx;
int y_index = groupId_x * TILE_DIM + ly;
__local char4 title[TILE_DIM * LDS_STEP];
if(x < src_cols && y < src_rows)
{
int index_src = mad24(y, src_step, (x << 2));
#pragma unroll
for(int i = 0; i < TILE_DIM; i += BLOCK_ROWS)
{
if(y + i < src_rows)
{
title[(ly + i) * LDS_STEP + lx] = *((__global char4 *)(src + src_offset + index_src));
index_src = mad24(BLOCK_ROWS, src_step, index_src);
}
}
}
barrier(CLK_LOCAL_MEM_FENCE);
if(x_index < src_rows && y_index < src_cols)
{
int index_dst = mad24(y_index, dst_step, (x_index << 2));
#pragma unroll
for(int i = 0; i < TILE_DIM; i += BLOCK_ROWS)
{
if((y_index + i) < src_cols)
{
*((__global char4*)(dst + dst_offset + index_dst )) = title[lx * LDS_STEP + ly + i];
index_dst += dst_step * BLOCK_ROWS ;
}
}
}
}

View File

@ -0,0 +1,137 @@
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Zero Lin, zero.lin@amd.com
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//
__kernel void convertC3C4_D0(__global const char4 * restrict src, __global char4 *dst, int cols, int rows,
int srcStep, int dstStep)
{
int id = get_global_id(0);
int y = id / cols;
int x = id % cols;
int d = y * srcStep + x * 3;
char8 data = (char8)(src[d>>2], src[(d>>2) + 1]);
char temp[8] = {data.s0, data.s1, data.s2, data.s3, data.s4, data.s5, data.s6, data.s7};
int start = d & 3;
char4 ndata = (char4)(temp[start], temp[start + 1], temp[start + 2], 0);
if(y < rows)
dst[y * dstStep + x] = ndata;
}
__kernel void convertC3C4_D1(__global const short* restrict src, __global short4 *dst, int cols, int rows,
int srcStep, int dstStep)
{
int id = get_global_id(0);
int y = id / cols;
int x = id % cols;
int d = (y * srcStep + x * 6)>>1;
short4 data = *(__global short4 *)(src + ((d>>1)<<1));
short temp[4] = {data.s0, data.s1, data.s2, data.s3};
int start = d & 1;
short4 ndata = (short4)(temp[start], temp[start + 1], temp[start + 2], 0);
if(y < rows)
dst[y * dstStep + x] = ndata;
}
__kernel void convertC3C4_D2(__global const int * restrict src, __global int4 *dst, int cols, int rows,
int srcStep, int dstStep)
{
int id = get_global_id(0);
int y = id / cols;
int x = id % cols;
int d = (y * srcStep + x * 12)>>2;
int4 data = *(__global int4 *)(src + d);
data.w = 0;
if(y < rows)
dst[y * dstStep + x] = data;
}
__kernel void convertC4C3_D2(__global const int4 * restrict src, __global int *dst, int cols, int rows,
int srcStep, int dstStep)
{
int id = get_global_id(0);
int y = id / cols;
int x = id % cols;
int4 data = src[y * srcStep + x];
if(y < rows)
{
int d = y * dstStep + x * 3;
dst[d] = data.x;
dst[d + 1] = data.y;
dst[d + 2] = data.z;
}
}
__kernel void convertC4C3_D1(__global const short4 * restrict src, __global short *dst, int cols, int rows,
int srcStep, int dstStep)
{
int id = get_global_id(0);
int y = id / cols;
int x = id % cols;
short4 data = src[y * srcStep + x];
if(y < rows)
{
int d = y * dstStep + x * 3;
dst[d] = data.x;
dst[d + 1] = data.y;
dst[d + 2] = data.z;
}
}
__kernel void convertC4C3_D0(__global const char4 * restrict src, __global char *dst, int cols, int rows,
int srcStep, int dstStep)
{
int id = get_global_id(0);
int y = id / cols;
int x = id % cols;
char4 data = src[y * srcStep + x];
if(y < rows)
{
int d = y * dstStep + x * 3;
dst[d] = data.x;
dst[d + 1] = data.y;
dst[d + 2] = data.z;
}
}
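// These convert kernels exist because 3-channel rows are awkward for vector loads;
// the module widens C3 to C4 with a zero fourth lane and narrows it back. The
// mapping in plain C for the uchar case (reference only, names illustrative):
static void c3_to_c4_ref(const unsigned char *src, unsigned char *dst, int n)
{
    for (int i = 0; i < n; ++i)
    {
        dst[4 * i + 0] = src[3 * i + 0];
        dst[4 * i + 1] = src[3 * i + 1];
        dst[4 * i + 2] = src[3 * i + 2];
        dst[4 * i + 3] = 0;               /* padding lane */
    }
}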

View File

@ -0,0 +1,81 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Jia Haipeng, jiahaipeng95@gmail.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
/**************************************PUBLICFUNC*************************************/
#if defined (DOUBLE_SUPPORT)
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#endif
#if defined (DEPTH_0)
#define DATA_TYPE uchar
#endif
#if defined (DEPTH_2)
#define DATA_TYPE ushort
#endif
#define CV_DESCALE(x,n) (((x) + (1 << ((n)-1))) >> (n))
enum
{
yuv_shift = 14,
xyz_shift = 12,
R2Y = 4899,
G2Y = 9617,
B2Y = 1868,
BLOCK_SIZE = 256
};
__kernel void RGB2Gray(int cols,int rows,int src_step,int dst_step,int channels,
int bidx, __global const DATA_TYPE* src, __global DATA_TYPE* dst)
{
const int x = get_global_id(0);
const int y = get_global_id(1);
if (y < rows && x < cols)
{
int src_idx = y * src_step + x * channels * sizeof(DATA_TYPE);
int dst_idx = y * dst_step + x * sizeof(DATA_TYPE);
dst[dst_idx] = (DATA_TYPE)CV_DESCALE((src[src_idx + bidx] * B2Y + src[src_idx + 1] * G2Y + src[src_idx + (bidx^2)] * R2Y), yuv_shift);
}
}

View File

@ -0,0 +1,310 @@
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Niko Li, newlife20080214@gmail.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//
#define READ_TIMES_COL ((2*(RADIUSY+LSIZE1)-1)/LSIZE1)
#define RADIUS 1
#if CN ==1
#define ALIGN (((RADIUS)+3)>>2<<2)
#elif CN==2
#define ALIGN (((RADIUS)+1)>>1<<1)
#elif CN==3
#define ALIGN (((RADIUS)+3)>>2<<2)
#elif CN==4
#define ALIGN (RADIUS)
#define READ_TIMES_ROW ((2*(RADIUS+LSIZE0)-1)/LSIZE0)
#endif
#ifdef BORDER_CONSTANT
//BORDER_CONSTANT: iiiiii|abcdefgh|iiiiiii
#define ELEM(i,l_edge,r_edge,elem1,elem2) (i)<(l_edge) | (i) >= (r_edge) ? (elem1) : (elem2)
#endif
#ifdef BORDER_REPLICATE
//BORDER_REPLICATE: aaaaaa|abcdefgh|hhhhhhh
#define ADDR_L(i,l_edge,r_edge) (i) < (l_edge) ? (l_edge) : (i)
#define ADDR_R(i,r_edge,addr) (i) >= (r_edge) ? (r_edge)-1 : (addr)
#endif
#ifdef BORDER_REFLECT
//BORDER_REFLECT: fedcba|abcdefgh|hgfedcb
#define ADDR_L(i,l_edge,r_edge) (i) < (l_edge) ? -(i)-1 : (i)
#define ADDR_R(i,r_edge,addr) (i) >= (r_edge) ? -(i)-1+((r_edge)<<1) : (addr)
#endif
#ifdef BORDER_REFLECT_101
//BORDER_REFLECT_101: gfedcb|abcdefgh|gfedcba
#define ADDR_L(i,l_edge,r_edge) (i) < (l_edge) ? -(i) : (i)
#define ADDR_R(i,r_edge,addr) (i) >= (r_edge) ? -(i)-2+((r_edge)<<1) : (addr)
#endif
#ifdef BORDER_WRAP
//BORDER_WRAP: cdefgh|abcdefgh|abcdefg
#define ADDR_L(i,l_edge,r_edge) (i) < (l_edge) ? (i)+(r_edge) : (i)
#define ADDR_R(i,r_edge,addr) (i) >= (r_edge) ? (i)-(r_edge) : (addr)
#endif
/**********************************************************************************
These kernels are written for separable filters such as Sobel, Scharr and GaussianBlur.
As of 6/29/2011 the kernels only support the 8U data type, and the anchor of the
convolve kernel must be in the center. ROI is not supported either.
Each kernel reads 4 elements (not 4 pixels), saves them to LDS and then reads the
data it needs back from LDS to calculate the result.
The supported length of the convolve kernel is limited only by the MAX size of LDS,
which is hardware dependent.
Niko
6/29/2011
***********************************************************************************/
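/* Derivation of READ_TIMES_COL (added for clarity): a work-group covering
   LSIZE1 output rows must load LSIZE1 + 2*RADIUSY input rows, and each of its
   LSIZE1 rows of threads loads one row per pass, so the number of passes is
   ceil((LSIZE1 + 2*RADIUSY) / LSIZE1) = (2*(RADIUSY+LSIZE1) - 1) / LSIZE1 in
   integer arithmetic. E.g. LSIZE1 = 8, RADIUSY = 4: (2*12-1)/8 = 2 passes,
   and 8*2 = 16 rows cover the 8 + 2*4 = 16 rows needed. */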
__kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void col_filter_C1_D0
(__global const float * restrict src,
__global uchar * dst,
const int dst_cols,
const int dst_rows,
const int src_whole_cols,
const int src_whole_rows,
const int src_step_in_pixel,
//const int src_offset_x,
//const int src_offset_y,
const int dst_step_in_pixel,
const int dst_offset_in_pixel,
__constant float * mat_kernel __attribute__((max_constant_size(4*(2*RADIUSY+1)))))
{
int x = get_global_id(0);
int y = get_global_id(1);
int l_x = get_local_id(0);
int l_y = get_local_id(1);
int start_addr = mad24(y,src_step_in_pixel,x);
int end_addr = mad24(src_whole_rows - 1,src_step_in_pixel,src_whole_cols);
int i;
float sum;
float temp[READ_TIMES_COL];
__local float LDS_DAT[LSIZE1*READ_TIMES_COL][LSIZE0+1];
//read pixels from src
for(i = 0;i<READ_TIMES_COL;i++)
{
int current_addr = start_addr+i*LSIZE1*src_step_in_pixel;
current_addr = current_addr < end_addr ? current_addr : 0;
temp[i] = src[current_addr];
}
//save pixels to lds
for(i = 0;i<READ_TIMES_COL;i++)
{
LDS_DAT[l_y+i*LSIZE1][l_x] = temp[i];
}
barrier(CLK_LOCAL_MEM_FENCE);
//read pixels from lds and calculate the result
sum = LDS_DAT[l_y+RADIUSY][l_x]*mat_kernel[RADIUSY];
for(i=1;i<=RADIUSY;i++)
{
temp[0]=LDS_DAT[l_y+RADIUSY-i][l_x];
temp[1]=LDS_DAT[l_y+RADIUSY+i][l_x];
sum += temp[0] * mat_kernel[RADIUSY-i]+temp[1] * mat_kernel[RADIUSY+i];
}
//write the result to dst
if((x<dst_cols) & (y<dst_rows))
{
start_addr = mad24(y,dst_step_in_pixel,x+dst_offset_in_pixel);
dst[start_addr] = convert_uchar_sat(sum);
}
}
__kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void col_filter_C4_D0
(__global const float4 * restrict src,
__global uchar4 * dst,
const int dst_cols,
const int dst_rows,
const int src_whole_cols,
const int src_whole_rows,
const int src_step_in_pixel,
//const int src_offset_x,
//const int src_offset_y,
const int dst_step_in_pixel,
const int dst_offset_in_pixel,
__constant float * mat_kernel __attribute__((max_constant_size(4*(2*RADIUSY+1)))))
{
int x = get_global_id(0);
int y = get_global_id(1);
int l_x = get_local_id(0);
int l_y = get_local_id(1);
int start_addr = mad24(y,src_step_in_pixel,x);
int end_addr = mad24(src_whole_rows - 1,src_step_in_pixel,src_whole_cols);
int i;
float4 sum;
float4 temp[READ_TIMES_COL];
__local float4 LDS_DAT[LSIZE1*READ_TIMES_COL][LSIZE0+1];
//read pixels from src
for(i = 0;i<READ_TIMES_COL;i++)
{
int current_addr = start_addr+i*LSIZE1*src_step_in_pixel;
current_addr = current_addr < end_addr ? current_addr : 0;
temp[i] = src[current_addr];
}
//save pixels to lds
for(i = 0;i<READ_TIMES_COL;i++)
{
LDS_DAT[l_y+i*LSIZE1][l_x] = temp[i];
}
barrier(CLK_LOCAL_MEM_FENCE);
//read pixels from lds and calculate the result
sum = LDS_DAT[l_y+RADIUSY][l_x]*mat_kernel[RADIUSY];
for(i=1;i<=RADIUSY;i++)
{
temp[0]=LDS_DAT[l_y+RADIUSY-i][l_x];
temp[1]=LDS_DAT[l_y+RADIUSY+i][l_x];
sum += temp[0] * mat_kernel[RADIUSY-i]+temp[1] * mat_kernel[RADIUSY+i];
}
//write the result to dst
if((x<dst_cols) & (y<dst_rows))
{
start_addr = mad24(y,dst_step_in_pixel,x+dst_offset_in_pixel);
dst[start_addr] = convert_uchar4_sat(sum);
}
}
__kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void col_filter_C1_D5
(__global const float * restrict src,
__global float * dst,
const int dst_cols,
const int dst_rows,
const int src_whole_cols,
const int src_whole_rows,
const int src_step_in_pixel,
//const int src_offset_x,
//const int src_offset_y,
const int dst_step_in_pixel,
const int dst_offset_in_pixel,
__constant float * mat_kernel __attribute__((max_constant_size(4*(2*RADIUSY+1)))))
{
int x = get_global_id(0);
int y = get_global_id(1);
int l_x = get_local_id(0);
int l_y = get_local_id(1);
int start_addr = mad24(y,src_step_in_pixel,x);
int end_addr = mad24(src_whole_rows - 1,src_step_in_pixel,src_whole_cols);
int i;
float sum;
float temp[READ_TIMES_COL];
__local float LDS_DAT[LSIZE1*READ_TIMES_COL][LSIZE0+1];
//read pixels from src
for(i = 0;i<READ_TIMES_COL;i++)
{
int current_addr = start_addr+i*LSIZE1*src_step_in_pixel;
current_addr = current_addr < end_addr ? current_addr : 0;
temp[i] = src[current_addr];
}
//save pixels to lds
for(i = 0;i<READ_TIMES_COL;i++)
{
LDS_DAT[l_y+i*LSIZE1][l_x] = temp[i];
}
barrier(CLK_LOCAL_MEM_FENCE);
//read pixels from lds and calculate the result
sum = LDS_DAT[l_y+RADIUSY][l_x]*mat_kernel[RADIUSY];
for(i=1;i<=RADIUSY;i++)
{
temp[0]=LDS_DAT[l_y+RADIUSY-i][l_x];
temp[1]=LDS_DAT[l_y+RADIUSY+i][l_x];
sum += temp[0] * mat_kernel[RADIUSY-i]+temp[1] * mat_kernel[RADIUSY+i];
}
//write the result to dst
if((x<dst_cols) & (y<dst_rows))
{
start_addr = mad24(y,dst_step_in_pixel,x+dst_offset_in_pixel);
dst[start_addr] = sum;
}
}
__kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void col_filter_C4_D5
(__global const float4 * restrict src,
__global float4 * dst,
const int dst_cols,
const int dst_rows,
const int src_whole_cols,
const int src_whole_rows,
const int src_step_in_pixel,
//const int src_offset_x,
//const int src_offset_y,
const int dst_step_in_pixel,
const int dst_offset_in_pixel,
__constant float * mat_kernel __attribute__((max_constant_size(4*(2*RADIUSY+1)))))
{
int x = get_global_id(0);
int y = get_global_id(1);
int l_x = get_local_id(0);
int l_y = get_local_id(1);
int start_addr = mad24(y,src_step_in_pixel,x);
int end_addr = mad24(src_whole_rows - 1,src_step_in_pixel,src_whole_cols);
int i;
float4 sum;
float4 temp[READ_TIMES_COL];
__local float4 LDS_DAT[LSIZE1*READ_TIMES_COL][LSIZE0+1];
//read pixels from src
for(i = 0;i<READ_TIMES_COL;i++)
{
int current_addr = start_addr+i*LSIZE1*src_step_in_pixel;
current_addr = current_addr < end_addr ? current_addr : 0;
temp[i] = src[current_addr];
}
//save pixels to lds
for(i = 0;i<READ_TIMES_COL;i++)
{
LDS_DAT[l_y+i*LSIZE1][l_x] = temp[i];
}
barrier(CLK_LOCAL_MEM_FENCE);
//read pixels from lds and calculate the result
sum = LDS_DAT[l_y+RADIUSY][l_x]*mat_kernel[RADIUSY];
for(i=1;i<=RADIUSY;i++)
{
temp[0]=LDS_DAT[l_y+RADIUSY-i][l_x];
temp[1]=LDS_DAT[l_y+RADIUSY+i][l_x];
sum += temp[0] * mat_kernel[RADIUSY-i]+temp[1] * mat_kernel[RADIUSY+i];
}
//write the result to dst
if((x<dst_cols) & (y<dst_rows))
{
start_addr = mad24(y,dst_step_in_pixel,x+dst_offset_in_pixel);
dst[start_addr] = sum;
}
}

View File

@ -0,0 +1,469 @@
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Niko Li, newlife20080214@gmail.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//
#define READ_TIMES_ROW ((2*(RADIUSX+LSIZE0)-1)/LSIZE0) //for c4 only
#define READ_TIMES_COL ((2*(RADIUSY+LSIZE1)-1)/LSIZE1)
//#pragma OPENCL EXTENSION cl_amd_printf : enable
#define RADIUS 1
#if CN ==1
#define ALIGN (((RADIUS)+3)>>2<<2)
#elif CN==2
#define ALIGN (((RADIUS)+1)>>1<<1)
#elif CN==3
#define ALIGN (((RADIUS)+3)>>2<<2)
#elif CN==4
#define ALIGN (RADIUS)
#endif
#ifdef BORDER_CONSTANT
//BORDER_CONSTANT: iiiiii|abcdefgh|iiiiiii
#define ELEM(i,l_edge,r_edge,elem1,elem2) (i)<(l_edge) | (i) >= (r_edge) ? (elem1) : (elem2)
#endif
#ifdef BORDER_REPLICATE
//BORDER_REPLICATE: aaaaaa|abcdefgh|hhhhhhh
#define ADDR_L(i,l_edge,r_edge,addr) (i) < (l_edge) ? (l_edge) : (addr)
#define ADDR_R(i,r_edge,addr) (i) >= (r_edge) ? (r_edge)-1 : (addr)
#endif
#ifdef BORDER_REFLECT
//BORDER_REFLECT: fedcba|abcdefgh|hgfedcb
#define ADDR_L(i,l_edge,r_edge,addr) (i) < (l_edge) ? -(i)-1 : (addr)
#define ADDR_R(i,r_edge,addr) (i) >= (r_edge) ? -(i)-1+((r_edge)<<1) : (addr)
#endif
#ifdef BORDER_REFLECT_101
//BORDER_REFLECT_101: gfedcb|abcdefgh|gfedcba
#define ADDR_L(i,l_edge,r_edge,addr) (i) < (l_edge) ? -(i) : (addr)
#define ADDR_R(i,r_edge,addr) (i) >= (r_edge) ? -(i)-2+((r_edge)<<1) : (addr)
#endif
#ifdef BORDER_WRAP
//BORDER_WRAP: cdefgh|abcdefgh|abcdefg
#define ADDR_L(i,l_edge,r_edge,addr) (i) < (l_edge) ? (i)+(r_edge) : (addr)
#define ADDR_R(i,r_edge,addr) (i) >= (r_edge) ? (i)-(r_edge) : (addr)
#endif
/**********************************************************************************
These kernels are written for separable filters such as Sobel, Scharr and GaussianBlur.
As of 6/29/2011 the kernels only support the 8U data type, and the anchor of the
convolve kernel must be in the center. ROI is not supported either.
For channels = 1,2,4 each kernel reads 4 elements (not 4 pixels), and for channels = 3
the kernel reads 4 pixels; they are saved to LDS and the data needed is then read
back from LDS to calculate the result.
The supported length of the convolve kernel is related to LSIZE0 and to the MAX size
of LDS, which is hardware dependent:
For channels = 1,3 the RADIUS is no more than LSIZE0*2
For channels = 2, the RADIUS is no more than LSIZE0
For channels = 4, an arbitrary RADIUS is supported as long as the LDS is large enough
Niko
6/29/2011
***********************************************************************************/
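/* Derivation of READ_TIMES_ROW (added for clarity, element-per-thread case):
   a row of LSIZE0 threads must load LSIZE0 + 2*RADIUSX input elements, so
   ceil((LSIZE0 + 2*RADIUSX) / LSIZE0) = (2*(RADIUSX+LSIZE0) - 1) / LSIZE0
   passes are needed. E.g. LSIZE0 = 16, RADIUSX = 8: (2*24-1)/16 = 2 passes,
   and 16*2 = 32 elements cover the 16 + 2*8 = 32 needed. The C1_D0 kernel
   below loads uchar4 vectors, so each pass covers 4 elements per thread. */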
__kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_C1_D0
(__global const uchar * restrict src,
__global float * dst,
const int dst_cols,
const int dst_rows,
const int src_whole_cols,
const int src_whole_rows,
const int src_step_in_pixel,
const int src_offset_x,
const int src_offset_y,
const int dst_step_in_pixel,
const int radiusy,
__constant float * mat_kernel __attribute__((max_constant_size(4*(2*RADIUSX+1)))))
{
int x = get_global_id(0)<<2;
int y = get_global_id(1);
int l_x = get_local_id(0);
int l_y = get_local_id(1);
int start_x = (x+src_offset_x-RADIUSX) & 0xfffffffc; //align the first read down to a uchar4 boundary
int offset = (src_offset_x-RADIUSX) & 3; //misalignment inside that uchar4
int start_y = y+src_offset_y-radiusy;
int start_addr = mad24(start_y,src_step_in_pixel,start_x);
int i;
float4 sum;
uchar4 temp[READ_TIMES_ROW];
__local uchar4 LDS_DAT[LSIZE1][READ_TIMES_ROW*LSIZE0+1];
#ifdef BORDER_CONSTANT
int end_addr = mad24(src_whole_rows - 1,src_step_in_pixel,src_whole_cols);
//read pixels from src
for(i = 0;i<READ_TIMES_ROW;i++)
{
int current_addr = start_addr+i*LSIZE0*4;
current_addr = ((current_addr < end_addr) && (current_addr > 0)) ? current_addr : 0;
temp[i] = *(__global uchar4*)&src[current_addr];
}
//handle reads that fall outside the image boundary
for(i = 0;i<READ_TIMES_ROW;i++)
{
temp[i].x= ELEM(start_x+i*LSIZE0*4,0,src_whole_cols,0,temp[i].x);
temp[i].y= ELEM(start_x+i*LSIZE0*4+1,0,src_whole_cols,0,temp[i].y);
temp[i].z= ELEM(start_x+i*LSIZE0*4+2,0,src_whole_cols,0,temp[i].z);
temp[i].w= ELEM(start_x+i*LSIZE0*4+3,0,src_whole_cols,0,temp[i].w);
temp[i]= ELEM(start_y,0,src_whole_rows,(uchar4)0,temp[i]);
}
#else
int not_all_in_range = (start_x<0) | (start_x + READ_TIMES_ROW*LSIZE0*4+4>src_whole_cols)| (start_y<0) | (start_y >= src_whole_rows);
int4 index[READ_TIMES_ROW];
int4 addr;
int s_y;
if(not_all_in_range)
{
//handle reads that fall outside the image boundary
for(i = 0;i<READ_TIMES_ROW;i++)
{
index[i].x= ADDR_L(start_x+i*LSIZE0*4,0,src_whole_cols,start_x+i*LSIZE0*4);
index[i].x= ADDR_R(start_x+i*LSIZE0*4,src_whole_cols,index[i].x);
index[i].y= ADDR_L(start_x+i*LSIZE0*4+1,0,src_whole_cols,start_x+i*LSIZE0*4+1);
index[i].y= ADDR_R(start_x+i*LSIZE0*4+1,src_whole_cols,index[i].y);
index[i].z= ADDR_L(start_x+i*LSIZE0*4+2,0,src_whole_cols,start_x+i*LSIZE0*4+2);
index[i].z= ADDR_R(start_x+i*LSIZE0*4+2,src_whole_cols,index[i].z);
index[i].w= ADDR_L(start_x+i*LSIZE0*4+3,0,src_whole_cols,start_x+i*LSIZE0*4+3);
index[i].w= ADDR_R(start_x+i*LSIZE0*4+3,src_whole_cols,index[i].w);
}
s_y= ADDR_L(start_y,0,src_whole_rows,start_y);
s_y= ADDR_R(start_y,src_whole_rows,s_y);
//read pixels from src
for(i = 0;i<READ_TIMES_ROW;i++)
{
addr = mad24((int4)s_y,(int4)src_step_in_pixel,index[i]);
temp[i].x = src[addr.x];
temp[i].y = src[addr.y];
temp[i].z = src[addr.z];
temp[i].w = src[addr.w];
}
}
else
{
//read pixels from src
for(i = 0;i<READ_TIMES_ROW;i++)
{
temp[i] = *(__global uchar4*)&src[start_addr+i*LSIZE0*4];
}
}
#endif
//save pixels to lds
for(i = 0;i<READ_TIMES_ROW;i++)
{
LDS_DAT[l_y][l_x+i*LSIZE0]=temp[i];
}
barrier(CLK_LOCAL_MEM_FENCE);
//read pixels from lds and calculate the result
sum =convert_float4(vload4(0,(__local uchar*)&LDS_DAT[l_y][l_x]+RADIUSX+offset))*mat_kernel[RADIUSX];
for(i=1;i<=RADIUSX;i++)
{
temp[0]=vload4(0,(__local uchar*)&LDS_DAT[l_y][l_x]+RADIUSX+offset-i);
temp[1]=vload4(0,(__local uchar*)&LDS_DAT[l_y][l_x]+RADIUSX+offset+i);
sum += convert_float4(temp[0])*mat_kernel[RADIUSX-i]+convert_float4(temp[1])*mat_kernel[RADIUSX+i];
}
start_addr = mad24(y,dst_step_in_pixel,x);
//write the result to dst
if((x+3<dst_cols) & (y<dst_rows))
{
*(__global float4*)&dst[start_addr] = sum;
}
else if((x+2<dst_cols) & (y<dst_rows))
{
dst[start_addr] = sum.x;
dst[start_addr+1] = sum.y;
dst[start_addr+2] = sum.z;
}
else if((x+1<dst_cols) & (y<dst_rows))
{
dst[start_addr] = sum.x;
dst[start_addr+1] = sum.y;
}
else if((x<dst_cols) & (y<dst_rows))
{
dst[start_addr] = sum.x;
}
}
__kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_C4_D0
(__global const uchar4 * restrict src,
__global float4 * dst,
const int dst_cols,
const int dst_rows,
const int src_whole_cols,
const int src_whole_rows,
const int src_step_in_pixel,
const int src_offset_x,
const int src_offset_y,
const int dst_step_in_pixel,
const int radiusy,
__constant float * mat_kernel __attribute__((max_constant_size(4*(2*RADIUSX+1)))))
{
int x = get_global_id(0);
int y = get_global_id(1);
int l_x = get_local_id(0);
int l_y = get_local_id(1);
int start_x = x+src_offset_x-RADIUSX;
int start_y = y+src_offset_y-radiusy;
int start_addr = mad24(start_y,src_step_in_pixel,start_x);
int i;
float4 sum;
uchar4 temp[READ_TIMES_ROW];
__local uchar4 LDS_DAT[LSIZE1][READ_TIMES_ROW*LSIZE0+1];
#ifdef BORDER_CONSTANT
int end_addr = mad24(src_whole_rows - 1,src_step_in_pixel,src_whole_cols);
//read pixels from src
for(i = 0;i<READ_TIMES_ROW;i++)
{
int current_addr = start_addr+i*LSIZE0;
current_addr = ((current_addr < end_addr) && (current_addr > 0)) ? current_addr : 0;
temp[i] = src[current_addr];
}
//handle reads that fall outside the image boundary
for(i = 0;i<READ_TIMES_ROW;i++)
{
temp[i]= ELEM(start_x+i*LSIZE0,0,src_whole_cols,(uchar4)0,temp[i]);
temp[i]= ELEM(start_y,0,src_whole_rows,(uchar4)0,temp[i]);
}
#else
int index[READ_TIMES_ROW];
int s_x,s_y;
//handle reads that fall outside the image boundary
for(i = 0;i<READ_TIMES_ROW;i++)
{
s_x= ADDR_L(start_x+i*LSIZE0,0,src_whole_cols,start_x+i*LSIZE0);
s_x= ADDR_R(start_x+i*LSIZE0,src_whole_cols,s_x);
s_y= ADDR_L(start_y,0,src_whole_rows,start_y);
s_y= ADDR_R(start_y,src_whole_rows,s_y);
index[i]=mad24(s_y,src_step_in_pixel,s_x);
}
//read pixels from src
for(i = 0;i<READ_TIMES_ROW;i++)
{
temp[i] = src[index[i]];
}
#endif
//save pixels to lds
for(i = 0;i<READ_TIMES_ROW;i++)
{
LDS_DAT[l_y][l_x+i*LSIZE0]=temp[i];
}
barrier(CLK_LOCAL_MEM_FENCE);
//read pixels from lds and calculate the result
sum =convert_float4(LDS_DAT[l_y][l_x+RADIUSX])*mat_kernel[RADIUSX];
for(i=1;i<=RADIUSX;i++)
{
temp[0]=LDS_DAT[l_y][l_x+RADIUSX-i];
temp[1]=LDS_DAT[l_y][l_x+RADIUSX+i];
sum += convert_float4(temp[0])*mat_kernel[RADIUSX-i]+convert_float4(temp[1])*mat_kernel[RADIUSX+i];
}
//write the result to dst
if((x<dst_cols) & (y<dst_rows))
{
start_addr = mad24(y,dst_step_in_pixel,x);
dst[start_addr] = sum;
}
}
__kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_C1_D5
(__global const float * restrict src,
__global float * dst,
const int dst_cols,
const int dst_rows,
const int src_whole_cols,
const int src_whole_rows,
const int src_step_in_pixel,
const int src_offset_x,
const int src_offset_y,
const int dst_step_in_pixel,
const int radiusy,
__constant float * mat_kernel __attribute__((max_constant_size(4*(2*RADIUSX+1)))))
{
int x = get_global_id(0);
int y = get_global_id(1);
int l_x = get_local_id(0);
int l_y = get_local_id(1);
int start_x = x+src_offset_x-RADIUSX;
int start_y = y+src_offset_y-radiusy;
int start_addr = mad24(start_y,src_step_in_pixel,start_x);
int i;
float sum;
float temp[READ_TIMES_ROW];
__local float LDS_DAT[LSIZE1][READ_TIMES_ROW*LSIZE0+1];
#ifdef BORDER_CONSTANT
int end_addr = mad24(src_whole_rows - 1,src_step_in_pixel,src_whole_cols);
//read pixels from src
for(i = 0;i<READ_TIMES_ROW;i++)
{
int current_addr = start_addr+i*LSIZE0;
current_addr = ((current_addr < end_addr) && (current_addr > 0)) ? current_addr : 0;
temp[i] = src[current_addr];
}
//handle reads that fall outside the image boundary
for(i = 0;i<READ_TIMES_ROW;i++)
{
temp[i]= ELEM(start_x+i*LSIZE0,0,src_whole_cols,0,temp[i]);
temp[i]= ELEM(start_y,0,src_whole_rows,0,temp[i]);
}
#else
int index[READ_TIMES_ROW];
int s_x,s_y;
//handle reads that fall outside the image boundary
for(i = 0;i<READ_TIMES_ROW;i++)
{
s_x= ADDR_L(start_x+i*LSIZE0,0,src_whole_cols,start_x+i*LSIZE0);
s_x= ADDR_R(start_x+i*LSIZE0,src_whole_cols,s_x);
s_y= ADDR_L(start_y,0,src_whole_rows,start_y);
s_y= ADDR_R(start_y,src_whole_rows,s_y);
index[i]=mad24(s_y,src_step_in_pixel,s_x);
}
//read pixels from src
for(i = 0;i<READ_TIMES_ROW;i++)
{
temp[i] = src[index[i]];
}
#endif
//save pixels to lds
for(i = 0;i<READ_TIMES_ROW;i++)
{
LDS_DAT[l_y][l_x+i*LSIZE0]=temp[i];
}
barrier(CLK_LOCAL_MEM_FENCE);
//read pixels from lds and calculate the result
sum =LDS_DAT[l_y][l_x+RADIUSX]*mat_kernel[RADIUSX];
for(i=1;i<=RADIUSX;i++)
{
temp[0]=LDS_DAT[l_y][l_x+RADIUSX-i];
temp[1]=LDS_DAT[l_y][l_x+RADIUSX+i];
sum += temp[0]*mat_kernel[RADIUSX-i]+temp[1]*mat_kernel[RADIUSX+i];
}
//write the result to dst
if((x<dst_cols) & (y<dst_rows))
{
start_addr = mad24(y,dst_step_in_pixel,x);
dst[start_addr] = sum;
}
}
__kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_C4_D5
(__global const float4 * restrict src,
__global float4 * dst,
const int dst_cols,
const int dst_rows,
const int src_whole_cols,
const int src_whole_rows,
const int src_step_in_pixel,
const int src_offset_x,
const int src_offset_y,
const int dst_step_in_pixel,
const int radiusy,
__constant float * mat_kernel __attribute__((max_constant_size(4*(2*RADIUSX+1)))))
{
int x = get_global_id(0);
int y = get_global_id(1);
int l_x = get_local_id(0);
int l_y = get_local_id(1);
int start_x = x+src_offset_x-RADIUSX;
int start_y = y+src_offset_y-radiusy;
int start_addr = mad24(start_y,src_step_in_pixel,start_x);
int i;
float4 sum;
float4 temp[READ_TIMES_ROW];
__local float4 LDS_DAT[LSIZE1][READ_TIMES_ROW*LSIZE0+1];
#ifdef BORDER_CONSTANT
int end_addr = mad24(src_whole_rows - 1,src_step_in_pixel,src_whole_cols);
//read pixels from src
for(i = 0;i<READ_TIMES_ROW;i++)
{
int current_addr = start_addr+i*LSIZE0;
current_addr = ((current_addr < end_addr) && (current_addr > 0)) ? current_addr : 0;
temp[i] = src[current_addr];
}
//handle reads that fall outside the image boundary
for(i = 0;i<READ_TIMES_ROW;i++)
{
temp[i]= ELEM(start_x+i*LSIZE0,0,src_whole_cols,0,temp[i]);
temp[i]= ELEM(start_y,0,src_whole_rows,0,temp[i]);
}
#else
int index[READ_TIMES_ROW];
int s_x,s_y;
//handle reads that fall outside the image boundary
for(i = 0;i<READ_TIMES_ROW;i++)
{
s_x= ADDR_L(start_x+i*LSIZE0,0,src_whole_cols,start_x+i*LSIZE0);
s_x= ADDR_R(start_x+i*LSIZE0,src_whole_cols,s_x);
s_y= ADDR_L(start_y,0,src_whole_rows,start_y);
s_y= ADDR_R(start_y,src_whole_rows,s_y);
index[i]=mad24(s_y,src_step_in_pixel,s_x);
}
//read pixels from src
for(i = 0;i<READ_TIMES_ROW;i++)
{
temp[i] = src[index[i]];
}
#endif
//save pixels to lds
for(i = 0;i<READ_TIMES_ROW;i++)
{
LDS_DAT[l_y][l_x+i*LSIZE0]=temp[i];
}
barrier(CLK_LOCAL_MEM_FENCE);
//read pixels from lds and calculate the result
sum =LDS_DAT[l_y][l_x+RADIUSX]*mat_kernel[RADIUSX];
for(i=1;i<=RADIUSX;i++)
{
temp[0]=LDS_DAT[l_y][l_x+RADIUSX-i];
temp[1]=LDS_DAT[l_y][l_x+RADIUSX+i];
sum += temp[0]*mat_kernel[RADIUSX-i]+temp[1]*mat_kernel[RADIUSX+i];
}
//write the result to dst
if((x<dst_cols) & (y<dst_rows))
{
start_addr = mad24(y,dst_step_in_pixel,x);
dst[start_addr] = sum;
}
}

View File

@ -0,0 +1,458 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Zhang Ying, zhangying913@gmail.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
///////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////Macro for border type////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////////////////////////
#ifdef BORDER_REPLICATE
//BORDER_REPLICATE: aaaaaa|abcdefgh|hhhhhhh
#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? (l_edge) : (i))
#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? (r_edge)-1 : (addr))
#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? (t_edge) :(i))
#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? (b_edge)-1 :(addr))
#endif
#ifdef BORDER_REFLECT
//BORDER_REFLECT: fedcba|abcdefgh|hgfedcb
#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? -(i)-1 : (i))
#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? -(i)-1+((r_edge)<<1) : (addr))
#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? -(i)-1 : (i))
#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? -(i)-1+((b_edge)<<1) : (addr))
#endif
#ifdef BORDER_REFLECT_101
//BORDER_REFLECT_101: gfedcb|abcdefgh|gfedcba
#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? -(i) : (i))
#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? -(i)-2+((r_edge)<<1) : (addr))
#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? -(i) : (i))
#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? -(i)-2+((b_edge)<<1) : (addr))
#endif
//blur function does not support BORDER_WRAP
#ifdef BORDER_WRAP
//BORDER_WRAP: cdefgh|abcdefgh|abcdefg
#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? (i)+(r_edge) : (i))
#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? (i)-(r_edge) : (addr))
#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? (i)+(b_edge) : (i))
#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? (i)-(b_edge) : (addr))
#endif
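/* Worked example of the border macros (added for clarity), for
   BORDER_REFLECT_101 on a row of width w (gfedcb|abcdefgh|gfedcba):
   ADDR_L(-1, 0, w)   -> -(-1)    = 1    (element 'b')
   ADDR_R(w, w, addr) -> -w-2+2*w = w-2  (element 'g')
   Out-of-range indices are folded back into [0, w) before any load;
   ADDR_H and ADDR_B do the same for rows. */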
///////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////8uC1////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////
#define THREADS 256
#define ELEM(i, l_edge, r_edge, elem1, elem2) (i) >= (l_edge) && (i) < (r_edge) ? (elem1) : (elem2)
__kernel void boxFilter_C1_D0(__global const uchar * restrict src, __global uchar *dst, float alpha,
int src_offset, int src_whole_rows, int src_whole_cols, int src_step,
int dst_offset, int dst_rows, int dst_cols, int dst_step
)
{
int col = get_local_id(0);
const int gX = get_group_id(0);
const int gY = get_group_id(1);
int src_x_off = src_offset % src_step;
int src_y_off = src_offset / src_step;
int dst_x_off = dst_offset % dst_step;
int dst_y_off = dst_offset / dst_step;
int head_off = dst_x_off%4;
int startX = ((gX * (THREADS-ksX+1)-anX) * 4) - head_off + src_x_off;
int startY = (gY << 1) - anY + src_y_off;
int dst_startX = (gX * (THREADS-ksX+1) * 4) - head_off + dst_x_off;
int dst_startY = (gY << 1) + dst_y_off;
uint4 data[ksY+1];
__local uint4 temp[(THREADS<<1)];
#ifdef BORDER_CONSTANT
for(int i=0; i < ksY+1; i++)
{
if(startY+i >=0 && startY+i < src_whole_rows && startX+col*4 >=0 && startX+col*4+3<src_whole_cols)
data[i] = convert_uint4(vload4(col,(__global uchar*)(src+(startY+i)*src_step + startX)));
else
{
data[i]=0;
int con = startY+i >=0 && startY+i < src_whole_rows && startX+col*4 >=0 && startX+col*4<src_whole_cols;
if(con)data[i].s0 = *(src+(startY+i)*src_step + startX + col*4);
con = startY+i >=0 && startY+i < src_whole_rows && startX+col*4+1 >=0 && startX+col*4+1<src_whole_cols;
if(con)data[i].s1 = *(src+(startY+i)*src_step + startX + col*4+1) ;
con = startY+i >=0 && startY+i < src_whole_rows && startX+col*4+2 >=0 && startX+col*4+2<src_whole_cols;
if(con)data[i].s2 = *(src+(startY+i)*src_step + startX + col*4+2);
con = startY+i >=0 && startY+i < src_whole_rows && startX+col*4+3 >=0 && startX+col*4+3<src_whole_cols;
if(con)data[i].s3 = *(src+(startY+i)*src_step + startX + col*4+3);
}
}
#else
int not_all_in_range;
for(int i=0; i < ksY+1; i++)
{
not_all_in_range = (startX+col*4<0) | (startX+col*4+3>src_whole_cols-1)
| (startY+i<0) | (startY+i>src_whole_rows-1);
if(not_all_in_range)
{
int selected_row;
int4 selected_col;
selected_row = ADDR_H(startY+i, 0, src_whole_rows);
selected_row = ADDR_B(startY+i, src_whole_rows, selected_row);
selected_col.x = ADDR_L(startX+col*4, 0, src_whole_cols);
selected_col.x = ADDR_R(startX+col*4, src_whole_cols, selected_col.x);
selected_col.y = ADDR_L(startX+col*4+1, 0, src_whole_cols);
selected_col.y = ADDR_R(startX+col*4+1, src_whole_cols, selected_col.y);
selected_col.z = ADDR_L(startX+col*4+2, 0, src_whole_cols);
selected_col.z = ADDR_R(startX+col*4+2, src_whole_cols, selected_col.z);
selected_col.w = ADDR_L(startX+col*4+3, 0, src_whole_cols);
selected_col.w = ADDR_R(startX+col*4+3, src_whole_cols, selected_col.w);
data[i].x = *(src + selected_row * src_step + selected_col.x);
data[i].y = *(src + selected_row * src_step + selected_col.y);
data[i].z = *(src + selected_row * src_step + selected_col.z);
data[i].w = *(src + selected_row * src_step + selected_col.w);
}
else
{
data[i] = convert_uint4(vload4(col,(__global uchar*)(src+(startY+i)*src_step + startX)));
}
}
#endif
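/* Each work-group produces two adjacent output rows (gY<<1 and (gY<<1)+1).
   Their vertical kernel windows share rows 1..ksY-1 of data[], so that
   partial sum is computed once (sum0) and then specialized with row 0 for
   the upper output row (sum1) and row ksY for the lower one (sum2). */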
uint4 sum0 = 0, sum1 = 0, sum2 = 0;
for(int i=1; i < ksY; i++)
{
sum0 += (data[i]);
}
sum1 = sum0 + (data[0]);
sum2 = sum0 + (data[ksY]);
temp[col] = sum1;
temp[col+THREADS] = sum2;
barrier(CLK_LOCAL_MEM_FENCE);
if(col >= anX && col < (THREADS-ksX+anX+1))
{
int posX = dst_startX - dst_x_off + (col-anX)*4;
int posY = (gY << 1);
uint4 tmp_sum1=0, tmp_sum2=0;
for(int i=-anX; i<=anX; i++)
{
tmp_sum1 += vload4(col, (__local uint*)temp+i);
}
for(int i=-anX; i<=anX; i++)
{
tmp_sum2 += vload4(col, (__local uint*)(temp+THREADS)+i);
}
if(posY < dst_rows && posX < dst_cols)
{
if(posX >= 0 && posX < dst_cols)
*(dst+dst_startY * dst_step + dst_startX + (col-anX)*4) = tmp_sum1.x/alpha;
if(posX+1 >= 0 && posX+1 < dst_cols)
*(dst+dst_startY * dst_step + dst_startX+1 + (col-anX)*4) = tmp_sum1.y/alpha;
if(posX+2 >= 0 && posX+2 < dst_cols)
*(dst+dst_startY * dst_step + dst_startX+2 + (col-anX)*4) = tmp_sum1.z/alpha;
if(posX+3 >= 0 && posX+3 < dst_cols)
*(dst+dst_startY * dst_step + dst_startX+3 + (col-anX)*4) = tmp_sum1.w/alpha;
}
if(posY+1 < dst_rows && posX < dst_cols)
{
dst_startY+=1;
if(posX >= 0 && posX < dst_cols)
*(dst+dst_startY * dst_step + dst_startX + (col-anX)*4) = tmp_sum2.x/alpha;
if(posX+1 >= 0 && posX+1 < dst_cols)
*(dst+dst_startY * dst_step + dst_startX+1 + (col-anX)*4) = tmp_sum2.y/alpha;
if(posX+2 >= 0 && posX+2 < dst_cols)
*(dst+dst_startY * dst_step + dst_startX+2 + (col-anX)*4) = tmp_sum2.z/alpha;
if(posX+3 >= 0 && posX+3 < dst_cols)
*(dst+dst_startY * dst_step + dst_startX+3 + (col-anX)*4) = tmp_sum2.w/alpha;
}
}
}
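/* alpha is the normalization divisor applied to the window sum; for a
   normalized box filter the host would presumably pass alpha = ksX*ksY
   (an assumption about the host side, not visible in this file). */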
///////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////8uC4////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////
__kernel void boxFilter_C4_D0(__global const uchar4 * restrict src, __global uchar4 *dst, float alpha,
int src_offset, int src_whole_rows, int src_whole_cols, int src_step,
int dst_offset, int dst_rows, int dst_cols, int dst_step
)
{
int col = get_local_id(0);
const int gX = get_group_id(0);
const int gY = get_group_id(1);
int src_x_off = (src_offset % src_step) >> 2;
int src_y_off = src_offset / src_step;
int dst_x_off = (dst_offset % dst_step) >> 2;
int dst_y_off = dst_offset / dst_step;
int startX = gX * (THREADS-ksX+1) - anX + src_x_off;
int startY = (gY << 1) - anY + src_y_off;
int dst_startX = gX * (THREADS-ksX+1) + dst_x_off;
int dst_startY = (gY << 1) + dst_y_off;
int end_addr = (src_whole_rows-1)*(src_step>>2) + src_whole_cols-4;
uint4 data[ksY+1];
__local uint4 temp[2][THREADS];
#ifdef BORDER_CONSTANT
bool con;
uint4 ss;
for(int i=0; i < ksY+1; i++)
{
con = startX+col >= 0 && startX+col < src_whole_cols && startY+i >= 0 && startY+i < src_whole_rows;
int cur_addr = clamp((startY+i)*(src_step>>2)+(startX+col),0,end_addr);
ss = convert_uint4(src[cur_addr]);
data[i] = con ? ss : 0;
}
#else
for(int i=0; i < ksY+1; i++)
{
int selected_row;
int selected_col;
selected_row = ADDR_H(startY+i, 0, src_whole_rows);
selected_row = ADDR_B(startY+i, src_whole_rows, selected_row);
selected_col = ADDR_L(startX+col, 0, src_whole_cols);
selected_col = ADDR_R(startX+col, src_whole_cols, selected_col);
data[i] = convert_uint4(src[selected_row * (src_step>>2) + selected_col]);
}
#endif
uint4 sum0 = 0, sum1 = 0, sum2 = 0;
for(int i=1; i < ksY; i++)
{
sum0 += (data[i]);
}
sum1 = sum0 + (data[0]);
sum2 = sum0 + (data[ksY]);
temp[0][col] = sum1;
temp[1][col] = sum2;
barrier(CLK_LOCAL_MEM_FENCE);
if(col < (THREADS-(ksX-1)))
{
col += anX;
int posX = dst_startX - dst_x_off + col - anX;
int posY = (gY << 1);
uint4 tmp_sum[2]={(uint4)(0,0,0,0),(uint4)(0,0,0,0)};
for(int k=0; k<2; k++)
for(int i=-anX; i<=anX; i++)
{
tmp_sum[k] += temp[k][col+i];
}
for(int i=0; i<2; i++)
{
if(posX >= 0 && posX < dst_cols && (posY+i) >= 0 && (posY+i) < dst_rows)
dst[(dst_startY+i) * (dst_step>>2)+ dst_startX + col - anX] = convert_uchar4(convert_float4(tmp_sum[i])/alpha);
}
}
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////32fC1////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////
__kernel void boxFilter_C1_D5(__global const float *restrict src, __global float *dst, float alpha,
int src_offset, int src_whole_rows, int src_whole_cols, int src_step,
int dst_offset, int dst_rows, int dst_cols, int dst_step
)
{
int col = get_local_id(0);
const int gX = get_group_id(0);
const int gY = get_group_id(1);
int src_x_off = (src_offset % src_step) >> 2;
int src_y_off = src_offset / src_step;
int dst_x_off = (dst_offset % dst_step) >> 2;
int dst_y_off = dst_offset / dst_step;
int startX = gX * (THREADS-ksX+1) - anX + src_x_off;
int startY = (gY << 1) - anY + src_y_off;
int dst_startX = gX * (THREADS-ksX+1) + dst_x_off;
int dst_startY = (gY << 1) + dst_y_off;
int end_addr = (src_whole_rows-1)*(src_step>>2) + src_whole_cols-4;
float data[ksY+1];
__local float temp[2][THREADS];
#ifdef BORDER_CONSTANT
bool con;
float ss;
for(int i=0; i < ksY+1; i++)
{
con = startX+col >= 0 && startX+col < src_whole_cols && startY+i >= 0 && startY+i < src_whole_rows;
int cur_addr = clamp((startY+i)*(src_step>>2)+(startX+col),0,end_addr);
ss = src[cur_addr];
data[i] = con ? ss : 0.f;
}
#else
for(int i=0; i < ksY+1; i++)
{
int selected_row;
int selected_col;
selected_row = ADDR_H(startY+i, 0, src_whole_rows);
selected_row = ADDR_B(startY+i, src_whole_rows, selected_row);
selected_col = ADDR_L(startX+col, 0, src_whole_cols);
selected_col = ADDR_R(startX+col, src_whole_cols, selected_col);
data[i] = src[selected_row * (src_step>>2) + selected_col];
}
#endif
float sum0 = 0.0, sum1 = 0.0, sum2 = 0.0;
for(int i=1; i < ksY; i++)
{
sum0 += (data[i]);
}
sum1 = sum0 + (data[0]);
sum2 = sum0 + (data[ksY]);
temp[0][col] = sum1;
temp[1][col] = sum2;
barrier(CLK_LOCAL_MEM_FENCE);
if(col < (THREADS-(ksX-1)))
{
col += anX;
int posX = dst_startX - dst_x_off + col - anX;
int posY = (gY << 1);
float tmp_sum[2]={0.0, 0.0};
for(int k=0; k<2; k++)
for(int i=-anX; i<=anX; i++)
{
tmp_sum[k] += temp[k][col+i];
}
for(int i=0; i<2; i++)
{
if(posX >= 0 && posX < dst_cols && (posY+i) >= 0 && (posY+i) < dst_rows)
dst[(dst_startY+i) * (dst_step>>2)+ dst_startX + col - anX] = tmp_sum[i]/alpha;
}
}
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////32fC4////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////
__kernel void boxFilter_C4_D5(__global const float4 *restrict src, __global float4 *dst, float alpha,
int src_offset, int src_whole_rows, int src_whole_cols, int src_step,
int dst_offset, int dst_rows, int dst_cols, int dst_step
)
{
int col = get_local_id(0);
const int gX = get_group_id(0);
const int gY = get_group_id(1);
int src_x_off = (src_offset % src_step) >> 4;
int src_y_off = src_offset / src_step;
int dst_x_off = (dst_offset % dst_step) >> 4;
int dst_y_off = dst_offset / dst_step;
int startX = gX * (THREADS-ksX+1) - anX + src_x_off;
int startY = (gY << 1) - anY + src_y_off;
int dst_startX = gX * (THREADS-ksX+1) + dst_x_off;
int dst_startY = (gY << 1) + dst_y_off;
int end_addr = (src_whole_rows-1)*(src_step>>4) + src_whole_cols-16;
float4 data[ksY+1];
__local float4 temp[2][THREADS];
#ifdef BORDER_CONSTANT
bool con;
float4 ss;
for(int i=0; i < ksY+1; i++)
{
con = startX+col >= 0 && startX+col < src_whole_cols && startY+i >= 0 && startY+i < src_whole_rows;
int cur_addr = clamp((startY+i)*(src_step>>4)+(startX+col),0,end_addr);
ss = src[cur_addr];
data[i] = con ? ss : (float4)(0.0,0.0,0.0,0.0);
}
#else
for(int i=0; i < ksY+1; i++)
{
int selected_row;
int selected_col;
selected_row = ADDR_H(startY+i, 0, src_whole_rows);
selected_row = ADDR_B(startY+i, src_whole_rows, selected_row);
selected_col = ADDR_L(startX+col, 0, src_whole_cols);
selected_col = ADDR_R(startX+col, src_whole_cols, selected_col);
data[i] = src[selected_row * (src_step>>4) + selected_col];
}
#endif
float4 sum0 = 0.0, sum1 = 0.0, sum2 = 0.0;
for(int i=1; i < ksY; i++)
{
sum0 += (data[i]);
}
sum1 = sum0 + (data[0]);
sum2 = sum0 + (data[ksY]);
temp[0][col] = sum1;
temp[1][col] = sum2;
barrier(CLK_LOCAL_MEM_FENCE);
if(col < (THREADS-(ksX-1)))
{
col += anX;
int posX = dst_startX - dst_x_off + col - anX;
int posY = (gY << 1);
float4 tmp_sum[2]={(float4)(0.0,0.0,0.0,0.0), (float4)(0.0,0.0,0.0,0.0)};
for(int k=0; k<2; k++)
for(int i=-anX; i<=anX; i++)
{
tmp_sum[k] += temp[k][col+i];
}
for(int i=0; i<2; i++)
{
if(posX >= 0 && posX < dst_cols && (posY+i) >= 0 && (posY+i) < dst_rows)
dst[(dst_startY+i) * (dst_step>>4)+ dst_startX + col - anX] = tmp_sum[i]/alpha;
}
}
}

View File

@ -0,0 +1,192 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Zhang Ying, zhangying913@gmail.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#pragma OPENCL FP_CONTRACT ON
#define UCHAR_MIN 0
__kernel void dilate_C4_D5(__global const float4 * restrict src, __global float4 *dst, int srcOffset, int dstOffset,
int mincols, int maxcols, int minrows, int maxrows, int cols, int rows,
int srcStep, int dstStep, __constant uchar * mat_kernel, int src_whole_cols, int src_whole_rows)
{
int mX = get_global_id(0);
int mY = get_global_id(1);
int kX = mX - anX, kY = mY - anY;
int end_addr = mad24(src_whole_rows-1,srcStep,src_whole_cols);
float4 maxVal = (float4)(-FLT_MAX);
int k=0;
for(int i=0;i<ksY;i++, kY++ , kX = mX - anX)
{
for(int j=0;j<ksX; j++, kX++)
{
int current_addr = mad24(kY,srcStep,kX) + srcOffset;
current_addr = ((current_addr < end_addr) && (current_addr > 0)) ? current_addr : 0;
float4 v = src[current_addr];
uchar now = mat_kernel[k++];
float4 flag = (kX >= mincols & kX <= maxcols & kY >= minrows & kY <= maxrows & now != 0) ? v : (float4)(-FLT_MAX);
maxVal = max(maxVal , flag);
}
}
if(mX < cols && mY < rows)
dst[mY * dstStep + mX + dstOffset] = (maxVal);
}
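/* Branch-free masking idiom used throughout these morphology kernels:
   taps that fall outside the structuring element (now == 0) or outside the
   valid image area contribute the identity of the reduction (-FLT_MAX for
   the float max(), UCHAR_MIN for the uchar max()) instead of a real pixel,
   so the inner loops need no divergent branches. */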
__kernel void dilate_C1_D5(__global float4 * src, __global float *dst, int srcOffset, int dstOffset,
int mincols, int maxcols, int minrows, int maxrows, int cols, int rows,
int srcStep, int dstStep, __constant uchar * mat_kernel, int src_whole_cols, int src_whole_rows)
{
int mX = (get_global_id(0)<<2) - (dstOffset&3);
int mY = get_global_id(1);
int kX = mX - anX, kY = mY - anY;
int end_addr = mad24(src_whole_rows-1,srcStep,src_whole_cols);
float4 maxVal = (float4)(-FLT_MAX);
int k=0;
for(int i=0;i<ksY;i++, kY++ , kX = mX - anX)
{
for(int j=0;j<ksX;j++, kX++)
{
int start = mad24(kY,srcStep,kX) + srcOffset;
start = ((start < end_addr) && (start > 0)) ? start : 0;
int start2 = ((start + 4 < end_addr) && (start > 0)) ? start + 4 : 0;
float8 sVal = (float8)(src[start>>2], src[start2>>2]);
float sAry[8]= {sVal.s0, sVal.s1, sVal.s2, sVal.s3, sVal.s4, sVal.s5, sVal.s6, sVal.s7};
int det = start & 3;
float4 v=(float4)(sAry[det], sAry[det+1], sAry[det+2], sAry[det+3]);
uchar now = mat_kernel[k++];
float4 flag = (kY >= minrows & kY <= maxrows & now != 0) ? v : maxVal;
flag.x = (kX >= mincols & kX <= maxcols) ? flag.x : -FLT_MAX;
flag.y = (kX+1 >= mincols & kX+1 <= maxcols) ? flag.y : -FLT_MAX;
flag.z = (kX+2 >= mincols & kX+2 <= maxcols) ? flag.z : -FLT_MAX;
flag.w = (kX+3 >= mincols & kX+3 <= maxcols) ? flag.w : -FLT_MAX;
maxVal = max(maxVal , flag);
}
}
if(mY < rows && mX < cols)
{
__global float4* d = (__global float4*)(dst + mY * dstStep + mX + dstOffset);
float4 dVal = *d;
maxVal.x = (mX >=0 & mX < cols) ? maxVal.x : dVal.x;
maxVal.y = (mX+1 >=0 & mX+1 < cols) ? maxVal.y : dVal.y;
maxVal.z = (mX+2 >=0 & mX+2 < cols) ? maxVal.z : dVal.z;
maxVal.w = (mX+3 >=0 & mX+3 < cols) ? maxVal.w : dVal.w;
*d = (maxVal);
}
}
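/* The C1 kernels above and below read through a float4/uchar4 pointer even
   though the column index is not 4-aligned: start is clamped, a second
   vector is fetched at start+4, the two are concatenated into an 8-wide
   window, and det = start & 3 selects the misaligned 4-element slice from
   it, trading one unaligned access for two aligned vector loads plus a
   shuffle. */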
__kernel void dilate_C1_D0(__global const uchar4 * restrict src, __global uchar *dst, int srcOffset, int dstOffset,
int mincols, int maxcols, int minrows, int maxrows, int cols, int rows,
int srcStep, int dstStep, __constant uchar * mat_kernel, int src_whole_cols, int src_whole_rows)
{
int mX = (get_global_id(0)<<2) - (dstOffset&3);
int mY = get_global_id(1);
int kX = mX - anX, kY = mY - anY;
int end_addr = mad24(src_whole_rows-1,srcStep,src_whole_cols);
uchar4 maxVal = (uchar4)(UCHAR_MIN);
int k=0;
for(int i=0;i<ksY;i++, kY++ , kX = mX - anX)
{
for(int j=0;j<ksX;j++, kX++)
{
int start = mad24(kY,srcStep,kX) + srcOffset;
start = ((start < end_addr) && (start > 0)) ? start : 0;
int start2 = ((start + 4 < end_addr) && (start > 0)) ? start + 4 : 0;
uchar8 sVal = (uchar8)(src[start>>2], src[start2>>2]);
uchar sAry[8]= {sVal.s0, sVal.s1, sVal.s2, sVal.s3, sVal.s4, sVal.s5, sVal.s6, sVal.s7};
int det = start & 3;
uchar4 v=(uchar4)(sAry[det], sAry[det+1], sAry[det+2], sAry[det+3]);
uchar4 flag = (kY >= minrows & kY <= maxrows & mat_kernel[k++] != 0) ? v : maxVal;
flag.x = (kX >= mincols & kX <= maxcols) ? flag.x : UCHAR_MIN;
flag.y = (kX+1 >= mincols & kX+1 <= maxcols) ? flag.y : UCHAR_MIN;
flag.z = (kX+2 >= mincols & kX+2 <= maxcols) ? flag.z : UCHAR_MIN;
flag.w = (kX+3 >= mincols & kX+3 <= maxcols) ? flag.w : UCHAR_MIN;
maxVal = max(maxVal , flag);
}
}
if(mY < rows)
{
__global uchar4* d = (__global uchar4*)(dst + mY * dstStep + mX + dstOffset);
uchar4 dVal = *d;
maxVal.x = (mX >=0 & mX < cols) ? maxVal.x : dVal.x;
maxVal.y = (mX+1 >=0 & mX+1 < cols) ? maxVal.y : dVal.y;
maxVal.z = (mX+2 >=0 & mX+2 < cols) ? maxVal.z : dVal.z;
maxVal.w = (mX+3 >=0 & mX+3 < cols) ? maxVal.w : dVal.w;
*d = (maxVal);
}
}
__kernel void dilate_C4_D0(__global const uchar4 * restrict src, __global uchar4 *dst, int srcOffset, int dstOffset,
int mincols, int maxcols, int minrows, int maxrows, int cols, int rows,
int srcStep, int dstStep, __constant uchar * mat_kernel, int src_whole_cols, int src_whole_rows)
{
int mX = get_global_id(0);
int mY = get_global_id(1);
int kX = mX - anX, kY = mY - anY;
int end_addr = mad24(src_whole_rows-1,srcStep,src_whole_cols);
uchar4 maxVal = (uchar4)(UCHAR_MIN);
int k=0;
for(int i=0;i<ksY;i++, kY++ , kX = mX - anX)
{
for(int j=0;j<ksX;j++, kX++)
{
int current_addr = mad24(kY,srcStep,kX) + srcOffset;
current_addr = ((current_addr < end_addr) && (current_addr > 0)) ? current_addr : 0;
uchar4 v = src[current_addr];
uchar now = mat_kernel[k++];
uchar4 flag = (kX >= mincols & kX <= maxcols & kY >= minrows & kY <= maxrows & now != 0) ? v : maxVal;
maxVal = max(maxVal , flag);
}
}
if(mX < cols && mY < rows)
dst[mY * dstStep + mX + dstOffset] = (maxVal);
}
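/* Note: anX, anY, ksX and ksY (the kernel anchor and size) are not defined
   in this file; they are expected to be injected as compile-time constants
   when the host builds the program, e.g. (hedged sketch, values assumed):
       clBuildProgram(prog, 1, &dev, "-D anX=1 -D anY=1 -D ksX=3 -D ksY=3",
                      NULL, NULL);
*/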

View File

@ -0,0 +1,183 @@
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Niko Li, newlife20080214@gmail.com
// Zero Lin, zero.lin@amd.com
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//
__kernel void erode_C4_D5(__global const float4 * restrict src, __global float4 *dst, int srcOffset, int dstOffset,
int mincols, int maxcols, int minrows, int maxrows, int cols, int rows,
int srcStep, int dstStep, __constant uchar * mat_kernel, int src_whole_cols, int src_whole_rows)
{
int mX = get_global_id(0);
int mY = get_global_id(1);
int kX = mX - anX, kY = mY - anY;
int end_addr = mad24(src_whole_rows-1,srcStep,src_whole_cols);
float4 minVal = (float4)(3.4e+38);
int k=0;
for(int i=0;i<ksY;i++, kY++ , kX = mX - anX)
{
for(int j=0;j<ksX; j++, kX++)
{
int current_addr = mad24(kY,srcStep,kX) + srcOffset;
current_addr = ((current_addr < end_addr) && (current_addr > 0)) ? current_addr : 0;
float4 v = src[current_addr];
uchar now = mat_kernel[k++];
float4 flag = (kX >= mincols & kX <= maxcols & kY >= minrows & kY <= maxrows & now != 0) ? v : (float4)(3.4e+38);
minVal = min(minVal , flag);
}
}
if(mX < cols && mY < rows)
dst[mY * dstStep + mX + dstOffset] = (minVal);
}
__kernel void erode_C1_D5(__global float4 * src, __global float *dst, int srcOffset, int dstOffset,
int mincols, int maxcols, int minrows, int maxrows, int cols, int rows,
int srcStep, int dstStep, __constant uchar * mat_kernel, int src_whole_cols, int src_whole_rows)
{
int mX = (get_global_id(0)<<2) - (dstOffset&3);
int mY = get_global_id(1);
int kX = mX - anX, kY = mY - anY;
int end_addr = mad24(src_whole_rows-1,srcStep,src_whole_cols);
float4 minVal = (float4)(3.4e+38);
int k=0;
for(int i=0;i<ksY;i++, kY++ , kX = mX - anX)
{
for(int j=0;j<ksX;j++, kX++)
{
int start = mad24(kY,srcStep,kX) + srcOffset;
start = ((start < end_addr) && (start > 0)) ? start : 0;
int start2 = ((start + 4 < end_addr) && (start > 0)) ? start + 4 : 0;
float8 sVal = (float8)(src[start>>2], src[start2>>2]);
float sAry[8]= {sVal.s0, sVal.s1, sVal.s2, sVal.s3, sVal.s4, sVal.s5, sVal.s6, sVal.s7};
int det = start & 3;
float4 v=(float4)(sAry[det], sAry[det+1], sAry[det+2], sAry[det+3]);
uchar now = mat_kernel[k++];
float4 flag = (kY >= minrows & kY <= maxrows & now != 0) ? v : (float4)(3.4e+38);
flag.x = (kX >= mincols & kX <= maxcols) ? flag.x : 3.4e+38;
flag.y = (kX+1 >= mincols & kX+1 <= maxcols) ? flag.y : 3.4e+38;
flag.z = (kX+2 >= mincols & kX+2 <= maxcols) ? flag.z : 3.4e+38;
flag.w = (kX+3 >= mincols & kX+3 <= maxcols) ? flag.w : 3.4e+38;
minVal = min(minVal , flag);
}
}
if(mY < rows && mX < cols)
{
__global float4* d = (__global float4*)(dst + mY * dstStep + mX + dstOffset);
float4 dVal = *d;
minVal.x = (mX >=0 & mX < cols) ? minVal.x : dVal.x;
minVal.y = (mX+1 >=0 & mX+1 < cols) ? minVal.y : dVal.y;
minVal.z = (mX+2 >=0 & mX+2 < cols) ? minVal.z : dVal.z;
minVal.w = (mX+3 >=0 & mX+3 < cols) ? minVal.w : dVal.w;
*d = (minVal);
}
}
__kernel void erode_C1_D0(__global const uchar4 * restrict src, __global uchar *dst, int srcOffset, int dstOffset,
int mincols, int maxcols, int minrows, int maxrows, int cols, int rows,
int srcStep, int dstStep, __constant uchar * mat_kernel, int src_whole_cols, int src_whole_rows)
{
int mX = (get_global_id(0)<<2) - (dstOffset&3);
int mY = get_global_id(1);
int kX = mX - anX, kY = mY - anY;
int end_addr = mad24(src_whole_rows-1,srcStep,src_whole_cols);
uchar4 minVal = (uchar4)(0xff);
int k=0;
for(int i=0;i<ksY;i++, kY++ , kX = mX - anX)
{
for(int j=0;j<ksX;j++, kX++)
{
int start = mad24(kY,srcStep,kX) + srcOffset;
start = ((start < end_addr) && (start > 0)) ? start : 0;
int start2 = ((start + 4 < end_addr) && (start > 0)) ? start + 4 : 0;
uchar8 sVal = (uchar8)(src[start>>2], src[start2>>2]);
uchar sAry[8]= {sVal.s0, sVal.s1, sVal.s2, sVal.s3, sVal.s4, sVal.s5, sVal.s6, sVal.s7};
int det = start & 3;
uchar4 v=(uchar4)(sAry[det], sAry[det+1], sAry[det+2], sAry[det+3]);
uchar4 flag = (kY >= minrows & kY <= maxrows & mat_kernel[k++] != 0) ? v : (uchar4)(0xff);
flag.x = (kX >= mincols & kX <= maxcols) ? flag.x : 0xff;
flag.y = (kX+1 >= mincols & kX+1 <= maxcols) ? flag.y : 0xff;
flag.z = (kX+2 >= mincols & kX+2 <= maxcols) ? flag.z : 0xff;
flag.w = (kX+3 >= mincols & kX+3 <= maxcols) ? flag.w : 0xff;
minVal = min(minVal , flag);
}
}
if(mY < rows)
{
__global uchar4* d = (__global uchar4*)(dst + mY * dstStep + mX + dstOffset);
uchar4 dVal = *d;
minVal.x = (mX >=0 & mX < cols) ? minVal.x : dVal.x;
minVal.y = (mX+1 >=0 & mX+1 < cols) ? minVal.y : dVal.y;
minVal.z = (mX+2 >=0 & mX+2 < cols) ? minVal.z : dVal.z;
minVal.w = (mX+3 >=0 & mX+3 < cols) ? minVal.w : dVal.w;
*d = (minVal);
}
}
__kernel void erode_C4_D0(__global const uchar4 * restrict src, __global uchar4 *dst, int srcOffset, int dstOffset,
int mincols, int maxcols, int minrows, int maxrows, int cols, int rows,
int srcStep, int dstStep, __constant uchar * mat_kernel, int src_whole_cols, int src_whole_rows)
{
int mX = get_global_id(0);
int mY = get_global_id(1);
int kX = mX - anX, kY = mY - anY;
int end_addr = mad24(src_whole_rows-1,srcStep,src_whole_cols);
uchar4 minVal = (uchar4)(0xff);
int k=0;
for(int i=0;i<ksY;i++, kY++ , kX = mX - anX)
{
for(int j=0;j<ksX;j++, kX++)
{
int current_addr = mad24(kY,srcStep,kX) + srcOffset;
current_addr = ((current_addr < end_addr) && (current_addr > 0)) ? current_addr : 0;
uchar4 v = src[current_addr];
uchar now = mat_kernel[k++];
uchar4 flag = (kX >= mincols & kX <= maxcols & kY >= minrows & kY <= maxrows & now != 0) ? v : (uchar4)(0xff);
minVal = min(minVal , flag);
}
}
if(mX < cols && mY < rows)
dst[mY * dstStep + mX + dstOffset] = (minVal);
}

View File

@ -0,0 +1,531 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Jia Haipeng, jiahaipeng95@gmail.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#define BORDER_REFLECT_101
///////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////Macro for border type////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////////////////////////
#ifdef BORDER_REPLICATE
//BORDER_REPLICATE: aaaaaa|abcdefgh|hhhhhhh
#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? (l_edge) : (i))
#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? (r_edge)-1 : (addr))
#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? (t_edge) :(i))
#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? (b_edge)-1 :(addr))
#endif
#ifdef BORDER_REFLECT
//BORDER_REFLECT: fedcba|abcdefgh|hgfedcb
#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? -(i)-1 : (i))
#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? -(i)-1+((r_edge)<<1) : (addr))
#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? -(i)-1 : (i))
#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? -(i)-1+((b_edge)<<1) : (addr))
#endif
#ifdef BORDER_REFLECT_101
//BORDER_REFLECT_101: gfedcb|abcdefgh|gfedcba
#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? -(i) : (i))
#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? -(i)-2+((r_edge)<<1) : (addr))
#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? -(i) : (i))
#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? -(i)-2+((b_edge)<<1) : (addr))
#endif
#ifdef BORDER_WRAP
//BORDER_WRAP: cdefgh|abcdefgh|abcdefg
#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? (i)+(r_edge) : (i))
#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? (i)-(r_edge) : (addr))
#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? (i)+(b_edge) : (i))
#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? (i)-(b_edge) : (addr))
#endif
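As a quick sanity check of these mappings, the tiny host-side C test below copies the BORDER_REFLECT_101 pair verbatim and asserts hand-worked values for an 8-column image (the test harness itself is an illustration, not part of this commit).

#include <assert.h>

#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? -(i) : (i))
#define ADDR_R(i, r_edge, addr)   ((i) >= (r_edge) ? -(i)-2+((r_edge)<<1) : (addr))

int main(void)
{
    /* gfedcb|abcdefgh|gfedcba with cols = 8 */
    assert(ADDR_L(-2, 0, 8) == 2);               /* two left of 'a' reflects to 'c'  */
    assert(ADDR_L( 3, 0, 8) == 3);               /* interior index is untouched      */
    assert(ADDR_R( 9, 8, ADDR_L(9, 0, 8)) == 5); /* two right of 'h' reflects to 'f' */
    return 0;
}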
//////////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////Macro for define elements number per thread/////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////
#define ANCHOR 3
#define ANX 1
#define ANY 1
#define ROWS_PER_GROUP 4
#define ROWS_PER_GROUP_BITS 2
#define ROWS_FETCH (ROWS_PER_GROUP + ANY + ANY) //(ROWS_PER_GROUP + anY * 2)
#define THREADS_PER_ROW 64
#define THREADS_PER_ROW_BIT 6
#define ELEMENTS_PER_THREAD 4
#define ELEMENTS_PER_THREAD_BIT 2
#define LOCAL_MEM_STEP 260 //divup((get_local_size(0) + anX * 2), 4) * 4
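To see where 260 comes from, substitute the 256-thread group width these defines imply (THREADS_PER_ROW * ROWS_PER_GROUP) and anX = 1 on each side: divup(256 + 2, 4) * 4 = divup(258, 4) * 4 = 65 * 4 = 260, i.e. the padded row width rounded up to a multiple of 4 so each thread can vload4 from an aligned offset.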
///////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////8uC1////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////
__kernel void filter2D_C1_D0(__global uchar *src, int src_step, int src_offset_x, int src_offset_y,
__global uchar *dst, int dst_step, int dst_offset_x, int dst_offset_y,
__constant int *mat_kernel __attribute__((max_constant_size (16384))),
int cols,int rows, int operate_cols, int wholecols, int wholerows)
{
int gX = get_global_id(0);
int gY = get_global_id(1);
int lX = get_local_id(0);
int groupX_size = get_local_size(0);
int groupX_id = get_group_id(0);
#define dst_align (dst_offset_x & 3)
int cols_start_index_group = src_offset_x - dst_align + groupX_size * groupX_id - ANX;
int rows_start_index = src_offset_y + (gY << ROWS_PER_GROUP_BITS) - ANY;
__local uchar local_data[LOCAL_MEM_STEP * ROWS_FETCH];
if((gY << 2) < rows)
{
for(int i = 0; i < ROWS_FETCH; ++i)
{
if((rows_start_index - src_offset_y) + i < rows + ANY)
{
#ifdef BORDER_CONSTANT
int selected_row = rows_start_index + i;
int selected_cols = cols_start_index_group + lX;
uchar data = *(src + selected_row * src_step + selected_cols);
int con = selected_row >=0 && selected_row < wholerows && selected_cols >=0 && selected_cols < wholecols;
data = con ? data : 0;
local_data[i * LOCAL_MEM_STEP + lX ] =data;
if(lX < (ANX << 1))
{
selected_cols = cols_start_index_group + lX + groupX_size;
data = *(src + selected_row * src_step + selected_cols);
con = selected_row >=0 && selected_row < wholerows && selected_cols >=0 && selected_cols < wholecols;
data = con ? data : 0;
local_data[i * LOCAL_MEM_STEP + lX + groupX_size] =data;
}
#else
int selected_row = ADDR_H(rows_start_index + i, 0, wholerows);
selected_row = ADDR_B(rows_start_index + i, wholerows, selected_row);
int selected_cols = ADDR_L(cols_start_index_group + lX, 0, wholecols);
selected_cols = ADDR_R(cols_start_index_group + lX, wholecols, selected_cols);
uchar data = *(src + selected_row * src_step + selected_cols);
local_data[i * LOCAL_MEM_STEP + lX ] =data;
if(lX < (ANX << 1))
{
selected_cols = cols_start_index_group + lX + groupX_size;
selected_cols = ADDR_R(selected_cols, wholecols, selected_cols);
data = *(src + selected_row * src_step + selected_cols);
local_data[i * LOCAL_MEM_STEP + lX + groupX_size] =data;
}
#endif
}
}
}
barrier(CLK_LOCAL_MEM_FENCE);
int process_col = groupX_size * groupX_id + ((lX % THREADS_PER_ROW) << 2);
if(((gY << 2) < rows) && (process_col < operate_cols))
{
int dst_cols_start = dst_offset_x;
int dst_cols_end = dst_offset_x + cols;
int dst_cols_index = (dst_offset_x + process_col) & 0xfffffffc;
int dst_rows_end = dst_offset_y + rows;
int dst_rows_index = dst_offset_y + (gY << ROWS_PER_GROUP_BITS) + (lX >> THREADS_PER_ROW_BIT);
uchar4 dst_data = *((__global uchar4 *)(dst + dst_rows_index * dst_step + dst_cols_index));
int4 sum = (int4)(0);
uchar4 data;
for(int i = 0; i < ANCHOR; i++)
{
#pragma unroll 3
for(int j = 0; j < ANCHOR; j++)
{
if(dst_rows_index < dst_rows_end)
{
int local_row = (lX >> THREADS_PER_ROW_BIT) + i;
int local_cols = ((lX % THREADS_PER_ROW) << ELEMENTS_PER_THREAD_BIT) + j;
data = vload4(0, local_data+local_row * LOCAL_MEM_STEP + local_cols);
sum = sum + (mat_kernel[i * ANCHOR + j] * convert_int4_sat(data));
}
}
}
if(dst_rows_index < dst_rows_end)
{
sum.x = ((dst_cols_index + 0 >= dst_cols_start) && (dst_cols_index + 0 < dst_cols_end)) ? sum.x : dst_data.x;
sum.y = ((dst_cols_index + 1 >= dst_cols_start) && (dst_cols_index + 1 < dst_cols_end)) ? sum.y : dst_data.y;
sum.z = ((dst_cols_index + 2 >= dst_cols_start) && (dst_cols_index + 2 < dst_cols_end)) ? sum.z : dst_data.z;
sum.w = ((dst_cols_index + 3 >= dst_cols_start) && (dst_cols_index + 3 < dst_cols_end)) ? sum.w : dst_data.w;
*((__global uchar4 *)(dst + dst_rows_index * dst_step + dst_cols_index)) = convert_uchar4_sat(sum);
}
}
}
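A hedged sizing sketch for this kernel follows; the 256x1 work-group and the rounding are inferred from the defines above (64 threads per row x 4 rows, 4 pixels per thread), not taken from host code in this commit.

/* Hedged NDRange sizing for filter2D_C1_D0: each 256-thread group covers a
 * 256-column, 4-row tile; get_global_id(1) counts row-quads (gY << 2). */
#include <stddef.h>

static void filter2d_c1_d0_ndrange(int cols, int rows, size_t global[2], size_t local[2])
{
    local[0]  = 256;                                 /* one tile of threads          */
    local[1]  = 1;
    global[0] = ((size_t)(cols + 255) / 256) * 256;  /* columns rounded up to a tile */
    global[1] = (size_t)(rows + 3) / 4;              /* four output rows per group   */
}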
///////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////32FC1////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////
__kernel void filter2D_C1_D5(__global float *src, int src_step, int src_offset_x, int src_offset_y,
__global float *dst, int dst_step, int dst_offset_x, int dst_offset_y,
__constant int *mat_kernel __attribute__((max_constant_size (16384))),
int cols,int rows, int operate_cols, int wholecols, int wholerows)
{
int gX = get_global_id(0);
int gY = get_global_id(1);
int lX = get_local_id(0);
int groupX_size = get_local_size(0);
int groupX_id = get_group_id(0);
#define dst_align (dst_offset_x & 3)
int cols_start_index_group = src_offset_x - dst_align + groupX_size * groupX_id - ANX;
int rows_start_index = src_offset_y + (gY << ROWS_PER_GROUP_BITS) - ANY;
__local float local_data[LOCAL_MEM_STEP * ROWS_FETCH];
if(((gY << 2) < rows))
{
for(int i = 0; i < ROWS_FETCH; ++i)
{
if((rows_start_index - src_offset_y) + i < rows + ANY)
{
#ifdef BORDER_CONSTANT
int selected_row = rows_start_index + i;
int selected_cols = cols_start_index_group + lX;
float data = *((__global float *)((__global char *)src + selected_row * src_step + (selected_cols << 2)));
int con = selected_row >=0 && selected_row < wholerows && selected_cols >=0 && selected_cols < wholecols;
data = con ? data : 0;
local_data[i * LOCAL_MEM_STEP + lX ] =data;
if(lX < (ANX << 1))
{
selected_cols = cols_start_index_group + lX + groupX_size;
data = *((__global float *)((__global char *)src + selected_row * src_step + (selected_cols << 2)));
con = selected_row >=0 && selected_row < wholerows && selected_cols >=0 && selected_cols < wholecols;
data = con ? data : 0;
local_data[i * LOCAL_MEM_STEP + lX + groupX_size] =data;
}
#else
int selected_row = ADDR_H(rows_start_index + i, 0, wholerows);
selected_row = ADDR_B(rows_start_index + i, wholerows, selected_row);
int selected_cols = ADDR_L(cols_start_index_group + lX, 0, wholecols);
selected_cols = ADDR_R(cols_start_index_group + lX, wholecols, selected_cols);
float data = *((__global float *)((__global char *)src + selected_row * src_step + (selected_cols << 2)));
local_data[i * LOCAL_MEM_STEP + lX] =data;
if(lX < (ANX << 1))
{
selected_cols = cols_start_index_group + lX + groupX_size;
selected_cols = ADDR_R(selected_cols, wholecols, selected_cols);
data = *((__global float *)((__global char *)src + selected_row * src_step + (selected_cols << 2)));
local_data[i * LOCAL_MEM_STEP + lX + groupX_size] =data;
}
#endif
}
}
}
barrier(CLK_LOCAL_MEM_FENCE);
int process_col = groupX_size * groupX_id + ((lX % THREADS_PER_ROW) << 2);
if(((gY << 2) < rows) && (process_col < operate_cols))
{
int dst_cols_start = dst_offset_x;
int dst_cols_end = dst_offset_x + cols;
int dst_cols_index = (dst_offset_x + process_col) & 0xfffffffc;
int dst_rows_end = dst_offset_y + rows;
int dst_rows_index = dst_offset_y + (gY << ROWS_PER_GROUP_BITS) + (lX >> THREADS_PER_ROW_BIT);
float4 dst_data = *((__global float4*)((__global char *)dst + dst_rows_index * dst_step + (dst_cols_index << 2)));
float4 sum = (float4)(0);
float4 data;
for(int i = 0; i < ANCHOR; i++)
{
#pragma unroll 3
for(int j = 0; j < ANCHOR; j++)
{
if(dst_rows_index < dst_rows_end)
{
int local_row = (lX >> THREADS_PER_ROW_BIT) + i;
int local_cols = ((lX % THREADS_PER_ROW) << ELEMENTS_PER_THREAD_BIT) + j;
data = vload4(0, local_data+local_row * LOCAL_MEM_STEP + local_cols);
sum = sum + (mat_kernel[i * ANCHOR + j] * data);
}
}
}
if(dst_rows_index < dst_rows_end)
{
sum.x = ((dst_cols_index + 0 >= dst_cols_start) && (dst_cols_index + 0 < dst_cols_end)) ? sum.x : dst_data.x;
sum.y = ((dst_cols_index + 1 >= dst_cols_start) && (dst_cols_index + 1 < dst_cols_end)) ? sum.y : dst_data.y;
sum.z = ((dst_cols_index + 2 >= dst_cols_start) && (dst_cols_index + 2 < dst_cols_end)) ? sum.z : dst_data.z;
sum.w = ((dst_cols_index + 3 >= dst_cols_start) && (dst_cols_index + 3 < dst_cols_end)) ? sum.w : dst_data.w;
*((__global float4 *)((__global char *)dst + dst_rows_index * dst_step + (dst_cols_index << 2))) = sum;
}
}
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////8uC4////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////
__kernel void filter2D_C4_D0(__global uchar4 *src, int src_step, int src_offset_x, int src_offset_y,
__global uchar4 *dst, int dst_step, int dst_offset_x, int dst_offset_y,
__constant int *mat_kernel __attribute__((max_constant_size (16384))),
int cols,int rows, int operate_cols, int wholecols, int wholerows)
{
int gX = get_global_id(0);
int gY = get_global_id(1);
int lX = get_local_id(0);
int groupX_size = get_local_size(0);
int groupX_id = get_group_id(0);
#define dst_align (dst_offset_x & 3)
int cols_start_index_group = src_offset_x - dst_align + groupX_size * groupX_id - ANX;
int rows_start_index = src_offset_y + (gY << ROWS_PER_GROUP_BITS) - ANY;
__local uchar4 local_data[LOCAL_MEM_STEP * ROWS_FETCH];
if(((gY << 2) < rows))
{
for(int i = 0; i < ROWS_FETCH; ++i)
{
if((rows_start_index - src_offset_y) + i < rows + ANY)
{
#ifdef BORDER_CONSTANT
int selected_row = rows_start_index + i;
int selected_cols = cols_start_index_group + lX;
uchar4 data = *((__global uchar4*)((__global char*)src + selected_row * src_step + (selected_cols << 2)));
int con = selected_row >=0 && selected_row < wholerows && selected_cols >=0 && selected_cols < wholecols;
data = con ? data : 0;
local_data[i * LOCAL_MEM_STEP + lX ] =data;
if(lX < (ANX << 1))
{
selected_cols = cols_start_index_group + lX + groupX_size;
data = *((__global uchar4*)((__global char*)src + selected_row * src_step + (selected_cols << 2)));
con = selected_row >=0 && selected_row < wholerows && selected_cols >=0 && selected_cols < wholecols;
data = con ? data : 0;
local_data[i * LOCAL_MEM_STEP + lX + groupX_size] =data;
}
#else
int selected_row = ADDR_H(rows_start_index + i, 0, wholerows);
selected_row = ADDR_B(rows_start_index + i, wholerows, selected_row);
int selected_cols = ADDR_L(cols_start_index_group + lX, 0, wholecols);
selected_cols = ADDR_R(cols_start_index_group + lX, wholecols, selected_cols);
uchar4 data = *((__global uchar4*)((__global char*)src + selected_row * src_step + (selected_cols << 2)));
local_data[i * LOCAL_MEM_STEP + lX] =data;
if(lX < (ANX << 1))
{
selected_cols = cols_start_index_group + lX + groupX_size;
selected_cols = ADDR_R(selected_cols, wholecols, selected_cols);
data = *((__global uchar4*)((__global char*)src + selected_row * src_step + (selected_cols << 2)));
local_data[i * LOCAL_MEM_STEP + lX + groupX_size] =data;
}
#endif
}
}
}
barrier(CLK_LOCAL_MEM_FENCE);
int process_col = groupX_size * groupX_id + ((lX % THREADS_PER_ROW) << 2);
if(((gY << 2) < rows) && (process_col < operate_cols))
{
int dst_cols_start = dst_offset_x;
int dst_cols_end = dst_offset_x + cols;
int dst_cols_index = (dst_offset_x + process_col) & 0xfffffffc;
int dst_rows_end = dst_offset_y + rows;
int dst_rows_index = dst_offset_y + (gY << ROWS_PER_GROUP_BITS) + (lX >> THREADS_PER_ROW_BIT);
uchar16 dst_data;
dst_data = *((__global uchar16*)((__global char *)dst + dst_rows_index * dst_step + (dst_cols_index << 2)));
int16 sum = (int16)(0);
uchar16 data;
for(int i = 0; i < ANCHOR; i++)
{
#pragma unroll 3
for(int j = 0; j < ANCHOR; j++)
{
if(dst_rows_index < dst_rows_end)
{
int local_row = (lX >> THREADS_PER_ROW_BIT) + i;
int local_cols = ((lX % THREADS_PER_ROW) << ELEMENTS_PER_THREAD_BIT) + j;
data = vload16(0, (__local uchar *)(local_data+local_row * LOCAL_MEM_STEP + local_cols));
sum = sum + (mat_kernel[i * ANCHOR + j] * convert_int16_sat(data));
}
}
}
if(dst_rows_index < dst_rows_end)
{
uchar16 sum1 = convert_uchar16_sat(sum);
sum1.s0123 = ((dst_cols_index + 0 >= dst_cols_start) && (dst_cols_index + 0 < dst_cols_end))?
sum1.s0123 : dst_data.s0123;
sum1.s4567 = ((dst_cols_index + 1 >= dst_cols_start) && (dst_cols_index + 1 < dst_cols_end))?
sum1.s4567 : dst_data.s4567;
sum1.s89ab = ((dst_cols_index + 2 >= dst_cols_start) && (dst_cols_index + 2 < dst_cols_end))?
sum1.s89ab : dst_data.s89ab;
sum1.scdef = ((dst_cols_index + 3 >= dst_cols_start) && (dst_cols_index + 3 < dst_cols_end))?
sum1.scdef : dst_data.scdef;
*((__global uchar16*)((__global char *)dst + dst_rows_index * dst_step + (dst_cols_index << 2))) = sum1;
}
}
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////32FC4////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////
#define ROWS_FETCH_C4 (1 + ANY + ANY) //(1 + anY * 2)
#define LOCAL_MEM_STEP_C4 260 //divup((get_local_size(0) + anX * 2), 4) * 4
__kernel void filter2D_C4_D5(__global float4 *src, int src_step, int src_offset_x, int src_offset_y,
__global float4 *dst, int dst_step, int dst_offset_x, int dst_offset_y,
__constant int *mat_kernel __attribute__((max_constant_size (16384))),
int cols,int rows, int operate_cols, int wholecols, int wholerows)
{
int gX = get_global_id(0);
int gY = get_global_id(1);
int lX = get_local_id(0);
int groupX_size = get_local_size(0);
int groupX_id = get_group_id(0);
int cols_start_index_group = src_offset_x + groupX_size * groupX_id - ANX;
int rows_start_index = src_offset_y + gY - ANY;
__local float4 local_data[LOCAL_MEM_STEP_C4 * ROWS_FETCH_C4];
if((gY < rows) && (gX < (operate_cols + ANX + ANX)))
{
for(int i = 0; i < ROWS_FETCH_C4; ++i)
{
if((rows_start_index - src_offset_y) + i < rows + ANY)
{
#ifdef BORDER_CONSTANT
int selected_row = rows_start_index + i;
int selected_cols = cols_start_index_group + lX;
float4 data = *((__global float4*)((__global char*)src + selected_row * src_step + (selected_cols << 4)));
int con = selected_row >=0 && selected_row < wholerows && selected_cols >=0 && selected_cols < wholecols;
data = con ? data : 0;
local_data[i * LOCAL_MEM_STEP_C4 + lX ] =data;
if(lX < (ANX << 1))
{
selected_cols = cols_start_index_group + lX + groupX_size;
data = *((__global float4*)((__global char*)src + selected_row * src_step + (selected_cols << 4)));
con = selected_row >=0 && selected_row < wholerows && selected_cols >=0 && selected_cols < wholecols;
data = con ? data : 0;
local_data[i * LOCAL_MEM_STEP_C4 + lX + groupX_size] =data;
}
#else
int selected_row = ADDR_H(rows_start_index + i, 0, wholerows);
selected_row = ADDR_B(rows_start_index + i, wholerows, selected_row);
int selected_cols = ADDR_L(cols_start_index_group + lX, 0, wholecols);
selected_cols = ADDR_R(cols_start_index_group + lX, wholecols, selected_cols);
float4 data = *((__global float4*)((__global char*)src + selected_row * src_step + (selected_cols << 4)));
local_data[i * LOCAL_MEM_STEP_C4 + lX] =data;
if(lX < (ANX << 1))
{
selected_cols = cols_start_index_group + lX + groupX_size;
selected_cols = ADDR_R(selected_cols, wholecols, selected_cols);
data = *((__global float4*)((__global char*)src + selected_row * src_step + (selected_cols << 4)));
local_data[i * LOCAL_MEM_STEP_C4 + lX + groupX_size] =data;
}
#endif
}
}
}
barrier(CLK_LOCAL_MEM_FENCE);
if((gY < rows) && (gX < operate_cols))
{
int dst_cols_index = dst_offset_x + gX;
int dst_rows_index = dst_offset_y + gY;
float4 sum = (float4)(0);
for(int i = 0; i < ANCHOR; i++)
{
for(int j = 0; j < ANCHOR; j++)
{
int local_cols = lX + j;
sum = sum + mat_kernel[i * ANCHOR + j] * local_data[i * LOCAL_MEM_STEP_C4 + local_cols];
}
}
*((__global float4*)((__global char *)dst + dst_rows_index * dst_step + (dst_cols_index << 4))) = sum;
}
}

View File

@ -0,0 +1,550 @@
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Niko Li, newlife20080214@gmail.com
// Wang Weiyan, wangweiyanster@gmail.com
// Jia Haipeng, jiahaipeng95@gmail.com
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//
#pragma OPENCL EXTENSION cl_amd_printf : enable
#define CV_HAAR_FEATURE_MAX 3
#define calc_sum(rect,offset) (sum[(rect).p0+(offset)] - sum[(rect).p1+(offset)] - sum[(rect).p2+(offset)] + sum[(rect).p3+(offset)])
#define calc_sum1(rect,offset,i) (sum[(rect).p0[i]+(offset)] - sum[(rect).p1[i]+(offset)] - sum[(rect).p2[i]+(offset)] + sum[(rect).p3[i]+(offset)])
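These macros are the standard four-corner integral-image trick: once a rectangle's corner offsets are precomputed, its pixel sum costs four loads. A minimal scalar equivalent (assuming OpenCV's summed-area layout with one extra leading row and column, and a row-major stride `step`):

/* Sum of the w x h rectangle with top-left corner (x, y), given a
 * summed-area table S; the four corners match calc_sum's sign pattern. */
static int rect_sum(const int *S, int step, int x, int y, int w, int h)
{
    int p0 = y * step + x;             /* top-left     */
    int p1 = y * step + (x + w);       /* top-right    */
    int p2 = (y + h) * step + x;       /* bottom-left  */
    int p3 = (y + h) * step + (x + w); /* bottom-right */
    return S[p0] - S[p1] - S[p2] + S[p3];
}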
typedef int sumtype;
typedef float sqsumtype;
typedef struct __attribute__((aligned (128))) GpuHidHaarFeature
{
struct __attribute__((aligned (32)))
{
int p0 __attribute__((aligned (4)));
int p1 __attribute__((aligned (4)));
int p2 __attribute__((aligned (4)));
int p3 __attribute__((aligned (4)));
float weight __attribute__((aligned (4)));
}
rect[CV_HAAR_FEATURE_MAX] __attribute__((aligned (32)));
}
GpuHidHaarFeature;
typedef struct __attribute__((aligned (128) )) GpuHidHaarTreeNode
{
int p[CV_HAAR_FEATURE_MAX][4] __attribute__((aligned (64)));
float weight[CV_HAAR_FEATURE_MAX] /*__attribute__((aligned (16)))*/;
float threshold /*__attribute__((aligned (4)))*/;
float alpha[2] __attribute__((aligned (8)));
int left __attribute__((aligned (4)));
int right __attribute__((aligned (4)));
}
GpuHidHaarTreeNode;
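These alignment attributes only help if the host fills the node buffer with a byte-identical layout; a hedged host-side mirror (assuming a GCC-style compiler, and not code from this commit) would be:

/* Host-side mirror of GpuHidHaarTreeNode above; must match the device
 * struct byte-for-byte (128-byte aligned node, 64-byte aligned p[][]). */
typedef struct __attribute__((aligned(128))) HostHaarTreeNode
{
    int   p[3][4] __attribute__((aligned(64))); /* three rects, 4 ints each */
    float weight[3];
    float threshold;
    float alpha[2] __attribute__((aligned(8)));
    int   left;
    int   right;
} HostHaarTreeNode;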
typedef struct __attribute__((aligned (32))) GpuHidHaarClassifier
{
int count __attribute__((aligned (4)));
GpuHidHaarTreeNode* node __attribute__((aligned (8)));
float* alpha __attribute__((aligned (8)));
}
GpuHidHaarClassifier;
typedef struct __attribute__((aligned (64))) GpuHidHaarStageClassifier
{
int count __attribute__((aligned (4)));
float threshold __attribute__((aligned (4)));
int two_rects __attribute__((aligned (4)));
int reserved0 __attribute__((aligned (8)));
int reserved1 __attribute__((aligned (8)));
int reserved2 __attribute__((aligned (8)));
int reserved3 __attribute__((aligned (8)));
}
GpuHidHaarStageClassifier;
typedef struct __attribute__((aligned (64))) GpuHidHaarClassifierCascade
{
int count __attribute__((aligned (4)));
int is_stump_based __attribute__((aligned (4)));
int has_tilted_features __attribute__((aligned (4)));
int is_tree __attribute__((aligned (4)));
int pq0 __attribute__((aligned (4)));
int pq1 __attribute__((aligned (4)));
int pq2 __attribute__((aligned (4)));
int pq3 __attribute__((aligned (4)));
int p0 __attribute__((aligned (4)));
int p1 __attribute__((aligned (4)));
int p2 __attribute__((aligned (4)));
int p3 __attribute__((aligned (4)));
float inv_window_area __attribute__((aligned (4)));
}GpuHidHaarClassifierCascade;
__kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCascade(//constant GpuHidHaarClassifierCascade * cascade,
global GpuHidHaarStageClassifier * stagecascadeptr,
global int4 * info,
global GpuHidHaarTreeNode * nodeptr,
global const int * restrict sum1,
global const float * restrict sqsum1,
global int4 * candidate,
const int pixelstep,
const int loopcount,
const int start_stage,
const int split_stage,
const int end_stage,
const int startnode,
const int splitnode,
const int4 p,
const int4 pq,
const float correction
//const int width,
//const int height,
//const int grpnumperline,
//const int totalgrp
)
{
int grpszx = get_local_size(0);
int grpszy = get_local_size(1);
int grpnumx = get_num_groups(0);
int grpidx = get_group_id(0);
int lclidx = get_local_id(0);
int lclidy = get_local_id(1);
int lcl_sz = mul24(grpszx,grpszy);
int lcl_id = mad24(lclidy,grpszx,lclidx);
//assume lcl_sz == 256 or 128 or 64
//int lcl_sz_shift = (lcl_sz == 256) ? 8 : 7;
//lcl_sz_shift = (lcl_sz == 64) ? 6 : lcl_sz_shift;
__local int lclshare[1024];
#define OFF 0
__local int* lcldata = lclshare + OFF;//local copy of the integral-image window
__local int* glboutindex = lcldata + 28*28;//global output index
__local int* lclcount = glboutindex + 1;//number of pixels passing so far
__local int* lcloutindex = lclcount + 1;//info for the pixels that pass
__local float* partialsum = (__local float*)(lcloutindex + (lcl_sz<<1));
glboutindex[0]=0;
int outputoff = mul24(grpidx,256);
//assume window size is 20X20
#define WINDOWSIZE (20+1)
//make sure readwidth is a multiple of 4
//ystep =1, from host code
int readwidth = ((grpszx-1 + WINDOWSIZE+3)>>2)<<2;
int readheight = grpszy-1+WINDOWSIZE;
int read_horiz_cnt = readwidth >> 2;//each read int4
int total_read = mul24(read_horiz_cnt,readheight);
int read_loop = (total_read + lcl_sz - 1) >> 6;
candidate[outputoff+(lcl_id<<2)] = (int4)0;
candidate[outputoff+(lcl_id<<2)+1] = (int4)0;
candidate[outputoff+(lcl_id<<2)+2] = (int4)0;
candidate[outputoff+(lcl_id<<2)+3] = (int4)0;
for(int scalei = 0; scalei <loopcount; scalei++)
{
int4 scaleinfo1= info[scalei];
int width = (scaleinfo1.x & 0xffff0000) >> 16;
int height = scaleinfo1.x & 0xffff;
int grpnumperline =(scaleinfo1.y & 0xffff0000) >> 16;
int totalgrp = scaleinfo1.y & 0xffff;
int imgoff = scaleinfo1.z;
float factor = as_float(scaleinfo1.w);
//int ystep =1;// factor > 2.0 ? 1 : 2;
__global const int * sum = sum1 + imgoff;
__global const float * sqsum = sqsum1 + imgoff;
for(int grploop=grpidx;grploop<totalgrp;grploop+=grpnumx)
{
int grpidy = grploop / grpnumperline;
int grpidx = grploop - mul24(grpidy, grpnumperline);
int x = mad24(grpidx,grpszx,lclidx);
int y = mad24(grpidy,grpszy,lclidy);
//candidate_result.x = convert_int_rtn(x*factor);
//candidate_result.y = convert_int_rtn(y*factor);
int grpoffx = x-lclidx;
int grpoffy = y-lclidy;
for(int i=0;i<read_loop;i++)
{
int pos_id = mad24(i,lcl_sz,lcl_id);
pos_id = pos_id < total_read ? pos_id : 0;
int lcl_y = pos_id / read_horiz_cnt;
int lcl_x = pos_id - mul24(lcl_y, read_horiz_cnt);
int glb_x = grpoffx + (lcl_x<<2);
int glb_y = grpoffy + lcl_y;
int glb_off = mad24(glb_y,pixelstep,glb_x);
int4 data = *(__global int4*)&sum[glb_off];
int lcl_off = mad24(lcl_y, readwidth, lcl_x<<2);
lcldata[lcl_off] = data.x;
lcldata[lcl_off+1] = data.y;
lcldata[lcl_off+2] = data.z;
lcldata[lcl_off+3] = data.w;
}
lcloutindex[lcl_id] = 0;
lclcount[0] = 0;
int result = 1;
int nodecounter= startnode;
float mean, variance_norm_factor;
barrier(CLK_LOCAL_MEM_FENCE);
int lcl_off = mad24(lclidy,readwidth,lclidx);
int4 cascadeinfo1, cascadeinfo2;
cascadeinfo1 = p;
cascadeinfo2 = pq;// + mad24(y, pixelstep, x);
//if((x < width) && (y < height))
{
cascadeinfo1.x +=lcl_off;
cascadeinfo1.z +=lcl_off;
mean = (lcldata[mad24(cascadeinfo1.y,readwidth,cascadeinfo1.x)] - lcldata[mad24(cascadeinfo1.y,readwidth,cascadeinfo1.z)] -
lcldata[mad24(cascadeinfo1.w,readwidth,cascadeinfo1.x)] + lcldata[mad24(cascadeinfo1.w,readwidth,cascadeinfo1.z)])
*correction;
int p_offset = mad24(y, pixelstep, x);
cascadeinfo2.x +=p_offset;
cascadeinfo2.z +=p_offset;
variance_norm_factor =sqsum[mad24(cascadeinfo2.y, pixelstep, cascadeinfo2.x)] - sqsum[mad24(cascadeinfo2.y, pixelstep, cascadeinfo2.z)] -
sqsum[mad24(cascadeinfo2.w, pixelstep, cascadeinfo2.x)] + sqsum[mad24(cascadeinfo2.w, pixelstep, cascadeinfo2.z)];
variance_norm_factor = variance_norm_factor * correction - mean * mean;
variance_norm_factor = variance_norm_factor >=0.f ? sqrt(variance_norm_factor) : 1.f;
//if( cascade->is_stump_based )
//{
for(int stageloop = start_stage; (stageloop < split_stage) && result; stageloop++ )
{
float stage_sum = 0.f;
int2 stageinfo = *(global int2*)(stagecascadeptr+stageloop);
float stagethreshold = as_float(stageinfo.y);
for(int nodeloop = 0; nodeloop < stageinfo.x; nodeloop++ )
{
__global GpuHidHaarTreeNode* currentnodeptr = (nodeptr + nodecounter);
int4 info1 = *(__global int4*)(&(currentnodeptr->p[0][0]));
int4 info2 = *(__global int4*)(&(currentnodeptr->p[1][0]));
int4 info3 = *(__global int4*)(&(currentnodeptr->p[2][0]));
float4 w = *(__global float4*)(&(currentnodeptr->weight[0]));
float2 alpha2 = *(__global float2*)(&(currentnodeptr->alpha[0]));
float nodethreshold = w.w * variance_norm_factor;
info1.x +=lcl_off;
info1.z +=lcl_off;
info2.x +=lcl_off;
info2.z +=lcl_off;
float classsum = (lcldata[mad24(info1.y,readwidth,info1.x)] - lcldata[mad24(info1.y,readwidth,info1.z)] -
lcldata[mad24(info1.w,readwidth,info1.x)] + lcldata[mad24(info1.w,readwidth,info1.z)]) * w.x;
classsum += (lcldata[mad24(info2.y,readwidth,info2.x)] - lcldata[mad24(info2.y,readwidth,info2.z)] -
lcldata[mad24(info2.w,readwidth,info2.x)] + lcldata[mad24(info2.w,readwidth,info2.z)]) * w.y;
//if((info3.z - info3.x) && (!stageinfo.z))
//{
info3.x +=lcl_off;
info3.z +=lcl_off;
classsum += (lcldata[mad24(info3.y,readwidth,info3.x)] - lcldata[mad24(info3.y,readwidth,info3.z)] -
lcldata[mad24(info3.w,readwidth,info3.x)] + lcldata[mad24(info3.w,readwidth,info3.z)]) * w.z;
//}
stage_sum += classsum >= nodethreshold ? alpha2.y : alpha2.x;
nodecounter++;
}
result = (stage_sum >= stagethreshold);
}
if(result && (x < width) && (y < height))
{
int queueindex = atomic_inc(lclcount);
lcloutindex[queueindex<<1] = (lclidy << 16) | lclidx;
lcloutindex[(queueindex<<1)+1] = as_int(variance_norm_factor);
}
barrier(CLK_LOCAL_MEM_FENCE);
int queuecount = lclcount[0];
nodecounter = splitnode;
for(int stageloop = split_stage; stageloop< end_stage && queuecount>0;stageloop++)
{
lclcount[0]=0;
barrier(CLK_LOCAL_MEM_FENCE);
int2 stageinfo = *(global int2*)(stagecascadeptr+stageloop);
float stagethreshold = as_float(stageinfo.y);
int perfscale = queuecount > 4 ? 3 : 2;
int queuecount_loop = (queuecount + (1<<perfscale)-1) >> perfscale;
int lcl_compute_win = lcl_sz >> perfscale;
int lcl_compute_win_id = (lcl_id >>(6-perfscale));
int lcl_loops = (stageinfo.x + lcl_compute_win -1) >> (6-perfscale);
int lcl_compute_id = lcl_id - (lcl_compute_win_id << (6-perfscale));
for(int queueloop=0;queueloop<queuecount_loop && lcl_compute_win_id < queuecount;queueloop++)
{
float stage_sum = 0.f;
int temp_coord = lcloutindex[lcl_compute_win_id<<1];
float variance_norm_factor = as_float(lcloutindex[(lcl_compute_win_id<<1)+1]);
int queue_pixel = mad24(((temp_coord & (int)0xffff0000)>>16),readwidth,temp_coord & 0xffff);
int tempnodecounter = lcl_compute_id;
float part_sum = 0.f;
for(int lcl_loop=0;lcl_loop<lcl_loops && tempnodecounter<stageinfo.x;lcl_loop++)
{
__global GpuHidHaarTreeNode* currentnodeptr = (nodeptr + nodecounter + tempnodecounter);
int4 info1 = *(__global int4*)(&(currentnodeptr->p[0][0]));
int4 info2 = *(__global int4*)(&(currentnodeptr->p[1][0]));
int4 info3 = *(__global int4*)(&(currentnodeptr->p[2][0]));
float4 w = *(__global float4*)(&(currentnodeptr->weight[0]));
float2 alpha2 = *(__global float2*)(&(currentnodeptr->alpha[0]));
float nodethreshold = w.w * variance_norm_factor;
info1.x +=queue_pixel;
info1.z +=queue_pixel;
info2.x +=queue_pixel;
info2.z +=queue_pixel;
float classsum = (lcldata[mad24(info1.y,readwidth,info1.x)] - lcldata[mad24(info1.y,readwidth,info1.z)] -
lcldata[mad24(info1.w,readwidth,info1.x)] + lcldata[mad24(info1.w,readwidth,info1.z)]) * w.x;
classsum += (lcldata[mad24(info2.y,readwidth,info2.x)] - lcldata[mad24(info2.y,readwidth,info2.z)] -
lcldata[mad24(info2.w,readwidth,info2.x)] + lcldata[mad24(info2.w,readwidth,info2.z)]) * w.y;
//if((info3.z - info3.x) && (!stageinfo.z))
//{
info3.x +=queue_pixel;
info3.z +=queue_pixel;
classsum += (lcldata[mad24(info3.y,readwidth,info3.x)] - lcldata[mad24(info3.y,readwidth,info3.z)] -
lcldata[mad24(info3.w,readwidth,info3.x)] + lcldata[mad24(info3.w,readwidth,info3.z)]) * w.z;
//}
part_sum += classsum >= nodethreshold ? alpha2.y : alpha2.x;
tempnodecounter+=lcl_compute_win;
}//end for(int lcl_loop=0;lcl_loop<lcl_loops;lcl_loop++)
partialsum[lcl_id]=part_sum;
barrier(CLK_LOCAL_MEM_FENCE);
for(int i=0;i<lcl_compute_win && (lcl_compute_id==0);i++)
{
stage_sum += partialsum[lcl_id+i];
}
if(stage_sum >= stagethreshold && (lcl_compute_id==0))
{
int queueindex = atomic_inc(lclcount);
lcloutindex[queueindex<<1] = temp_coord;
lcloutindex[(queueindex<<1)+1] = as_int(variance_norm_factor);
}
lcl_compute_win_id +=(1<<perfscale);
barrier(CLK_LOCAL_MEM_FENCE);
}//end for(int queueloop=0;queueloop<queuecount_loop;queueloop++)
queuecount = lclcount[0];
nodecounter += stageinfo.x;
}//end for(int stageloop = splitstage; stageloop< endstage && queuecount>0;stageloop++)
if(lcl_id<queuecount)
{
int temp = lcloutindex[lcl_id<<1];
int x = mad24(grpidx,grpszx,temp & 0xffff);
int y = mad24(grpidy,grpszy,((temp & (int)0xffff0000) >> 16));
temp = glboutindex[0];
int4 candidate_result;
candidate_result.zw = (int2)convert_int_rtn(factor*20.f);
candidate_result.x = convert_int_rtn(x*factor);
candidate_result.y = convert_int_rtn(y*factor);
atomic_inc(glboutindex);
candidate[outputoff+temp+lcl_id] = candidate_result;
}
barrier(CLK_LOCAL_MEM_FENCE);
}//end if((x < width) && (y < height))
}//end for(int grploop=grpidx;grploop<totalgrp;grploop+=grpnumx)
//outputoff +=mul24(width,height);
}//end for(int scalei = 0; scalei <loopcount; scalei++)
}
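The normalization above is the usual one for Haar cascades: with integral images of pixels and squared pixels, mean = inv_area * sum(x) and the factor is sqrt(inv_area * sum(x^2) - mean^2), clamped to 1 when rounding drives the variance negative. A hedged scalar restatement (here `inv_area` plays the role of the kernel's `correction`):

#include <math.h>

/* Scalar restatement of the kernel's window normalization: sum_x and
 * sum_x2 are the window sums taken from the integral images. */
static float window_norm(long sum_x, double sum_x2, float inv_area)
{
    float mean = (float)(sum_x * inv_area);
    float var  = (float)(sum_x2 * inv_area) - mean * mean;
    return var >= 0.f ? sqrtf(var) : 1.f;   /* same clamp as the kernel */
}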
/*
if(stagecascade->two_rects)
{
#pragma unroll
for( n = 0; n < stagecascade->count; n++ )
{
t1 = *(node + counter);
t = t1.threshold * variance_norm_factor;
classsum = calc_sum1(t1,p_offset,0) * t1.weight[0];
classsum += calc_sum1(t1, p_offset,1) * t1.weight[1];
stage_sum += classsum >= t ? t1.alpha[1]:t1.alpha[0];
counter++;
}
}
else
{
#pragma unroll
for( n = 0; n < stagecascade->count; n++ )
{
t = node[counter].threshold*variance_norm_factor;
classsum = calc_sum1(node[counter],p_offset,0) * node[counter].weight[0];
classsum += calc_sum1(node[counter],p_offset,1) * node[counter].weight[1];
if( node[counter].p0[2] )
classsum += calc_sum1(node[counter],p_offset,2) * node[counter].weight[2];
stage_sum += classsum >= t ? node[counter].alpha[1]:node[counter].alpha[0];// modify
counter++;
}
}
*/
/*
__kernel void gpuRunHaarClassifierCascade_ScaleWindow(
constant GpuHidHaarClassifierCascade * _cascade,
global GpuHidHaarStageClassifier * stagecascadeptr,
//global GpuHidHaarClassifier * classifierptr,
global GpuHidHaarTreeNode * nodeptr,
global int * sum,
global float * sqsum,
global int * _candidate,
int pixel_step,
int cols,
int rows,
int start_stage,
int end_stage,
//int counts,
int nodenum,
int ystep,
int detect_width,
//int detect_height,
int loopcount,
int outputstep)
//float scalefactor)
{
unsigned int x1 = get_global_id(0);
unsigned int y1 = get_global_id(1);
int p_offset;
int m, n;
int result;
int counter;
float mean, variance_norm_factor;
for(int i=0;i<loopcount;i++)
{
constant GpuHidHaarClassifierCascade * cascade = _cascade + i;
global int * candidate = _candidate + i*outputstep;
int window_width = cascade->p1 - cascade->p0;
int window_height = window_width;
result = 1;
counter = 0;
unsigned int x = mul24(x1,ystep);
unsigned int y = mul24(y1,ystep);
if((x < cols - window_width - 1) && (y < rows - window_height -1))
{
global GpuHidHaarStageClassifier *stagecascade = stagecascadeptr +cascade->count*i+ start_stage;
//global GpuHidHaarClassifier *classifier = classifierptr;
global GpuHidHaarTreeNode *node = nodeptr + nodenum*i;
p_offset = mad24(y, pixel_step, x);// modify
mean = (*(sum + p_offset + (int)cascade->p0) - *(sum + p_offset + (int)cascade->p1) -
*(sum + p_offset + (int)cascade->p2) + *(sum + p_offset + (int)cascade->p3))
*cascade->inv_window_area;
variance_norm_factor = *(sqsum + p_offset + cascade->p0) - *(sqsum + cascade->p1 + p_offset) -
*(sqsum + p_offset + cascade->p2) + *(sqsum + cascade->p3 + p_offset);
variance_norm_factor = variance_norm_factor * cascade->inv_window_area - mean * mean;
variance_norm_factor = variance_norm_factor >=0.f ? sqrt(variance_norm_factor) : 1;//modify
// if( cascade->is_stump_based )
//{
for( m = start_stage; m < end_stage; m++ )
{
float stage_sum = 0.f;
float t, classsum;
GpuHidHaarTreeNode t1;
//#pragma unroll
for( n = 0; n < stagecascade->count; n++ )
{
t1 = *(node + counter);
t = t1.threshold * variance_norm_factor;
classsum = calc_sum1(t1, p_offset ,0) * t1.weight[0] + calc_sum1(t1, p_offset ,1) * t1.weight[1];
if((t1.p0[2]) && (!stagecascade->two_rects))
classsum += calc_sum1(t1, p_offset, 2) * t1.weight[2];
stage_sum += classsum >= t ? t1.alpha[1] : t1.alpha[0];// modify
counter++;
}
if (stage_sum < stagecascade->threshold)
{
result = 0;
break;
}
stagecascade++;
}
if(result)
{
candidate[4 * (y1 * detect_width + x1)] = x;
candidate[4 * (y1 * detect_width + x1) + 1] = y;
candidate[4 * (y1 * detect_width + x1)+2] = window_width;
candidate[4 * (y1 * detect_width + x1) + 3] = window_height;
}
//}
}
}
}
*/

View File

@ -0,0 +1,334 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Wu Xinglong, wxl370@126.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#pragma OPENCL EXTENSION cl_amd_printf:enable
#define CV_HAAR_FEATURE_MAX 3
typedef int sumtype;
typedef float sqsumtype;
typedef struct __attribute__((aligned (128))) GpuHidHaarFeature
{
struct __attribute__((aligned (32)))
{
int p0 __attribute__((aligned (4)));
int p1 __attribute__((aligned (4)));
int p2 __attribute__((aligned (4)));
int p3 __attribute__((aligned (4)));
float weight __attribute__((aligned (4)));
}
rect[CV_HAAR_FEATURE_MAX] __attribute__((aligned (32)));
}
GpuHidHaarFeature;
typedef struct __attribute__((aligned (128) )) GpuHidHaarTreeNode
{
int p[CV_HAAR_FEATURE_MAX][4] __attribute__((aligned (64)));
float weight[CV_HAAR_FEATURE_MAX] /*__attribute__((aligned (16)))*/;
float threshold /*__attribute__((aligned (4)))*/;
float alpha[2] __attribute__((aligned (8)));
int left __attribute__((aligned (4)));
int right __attribute__((aligned (4)));
}
GpuHidHaarTreeNode;
typedef struct __attribute__((aligned (32))) GpuHidHaarClassifier
{
int count __attribute__((aligned (4)));
GpuHidHaarTreeNode* node __attribute__((aligned (8)));
float* alpha __attribute__((aligned (8)));
}
GpuHidHaarClassifier;
typedef struct __attribute__((aligned (64))) GpuHidHaarStageClassifier
{
int count __attribute__((aligned (4)));
float threshold __attribute__((aligned (4)));
int two_rects __attribute__((aligned (4)));
int reserved0 __attribute__((aligned (8)));
int reserved1 __attribute__((aligned (8)));
int reserved2 __attribute__((aligned (8)));
int reserved3 __attribute__((aligned (8)));
}
GpuHidHaarStageClassifier;
typedef struct __attribute__((aligned (64))) GpuHidHaarClassifierCascade
{
int count __attribute__((aligned (4)));
int is_stump_based __attribute__((aligned (4)));
int has_tilted_features __attribute__((aligned (4)));
int is_tree __attribute__((aligned (4)));
int pq0 __attribute__((aligned (4)));
int pq1 __attribute__((aligned (4)));
int pq2 __attribute__((aligned (4)));
int pq3 __attribute__((aligned (4)));
int p0 __attribute__((aligned (4)));
int p1 __attribute__((aligned (4)));
int p2 __attribute__((aligned (4)));
int p3 __attribute__((aligned (4)));
float inv_window_area __attribute__((aligned (4)));
}GpuHidHaarClassifierCascade;
__kernel void gpuRunHaarClassifierCascade_scaled2(
global GpuHidHaarStageClassifier * stagecascadeptr,
global int4 * info,
global GpuHidHaarTreeNode * nodeptr,
global const int * restrict sum,
global const float * restrict sqsum,
global int4 * candidate,
const int step,
const int loopcount,
const int start_stage,
const int split_stage,
const int end_stage,
const int startnode,
const int splitnode,
global int4 * p,
//const int4 * pq,
global float * correction,
const int nodecount)
{
int grpszx = get_local_size(0);
int grpszy = get_local_size(1);
int grpnumx = get_num_groups(0);
int grpidx=get_group_id(0);
int lclidx = get_local_id(0);
int lclidy = get_local_id(1);
int lcl_sz = mul24(grpszx,grpszy);
int lcl_id = mad24(lclidy,grpszx,lclidx);
__local int lclshare[1024];
__local int* glboutindex=lclshare+0;
__local int* lclcount=glboutindex+1;
__local int* lcloutindex=lclcount+1;
__local float* partialsum=(__local float*)(lcloutindex+(lcl_sz<<1));
glboutindex[0]=0;
int outputoff = mul24(grpidx,256);
candidate[outputoff+(lcl_id<<2)] = (int4)0;
candidate[outputoff+(lcl_id<<2)+1] = (int4)0;
candidate[outputoff+(lcl_id<<2)+2] = (int4)0;
candidate[outputoff+(lcl_id<<2)+3] = (int4)0;
for(int scalei = 0; scalei <loopcount; scalei++)
{
int4 scaleinfo1;
scaleinfo1 = info[scalei];
int width = (scaleinfo1.x & 0xffff0000) >> 16;
int height = scaleinfo1.x & 0xffff;
int grpnumperline =(scaleinfo1.y & 0xffff0000) >> 16;
int totalgrp = scaleinfo1.y & 0xffff;
float factor = as_float(scaleinfo1.w);
float correction_t=correction[scalei];
int ystep=(int)(max(2.0f,factor)+0.5f);
for(int grploop=get_group_id(0);grploop<totalgrp;grploop+=grpnumx){
int4 cascadeinfo=p[scalei];
int grpidy = grploop / grpnumperline;
int grpidx = grploop - mul24(grpidy, grpnumperline);
int ix = mad24(grpidx,grpszx,lclidx);
int iy = mad24(grpidy,grpszy,lclidy);
int x=ix*ystep;
int y=iy*ystep;
lcloutindex[lcl_id]=0;
lclcount[0]=0;
int result=1,nodecounter;
float mean,variance_norm_factor;
//if((ix < width) && (iy < height))
{
const int p_offset = mad24(y, step, x);
cascadeinfo.x +=p_offset;
cascadeinfo.z +=p_offset;
mean = (sum[mad24(cascadeinfo.y,step,cascadeinfo.x)] - sum[mad24(cascadeinfo.y,step,cascadeinfo.z)] -
sum[mad24(cascadeinfo.w,step,cascadeinfo.x)] + sum[mad24(cascadeinfo.w,step,cascadeinfo.z)])
*correction_t;
variance_norm_factor =sqsum[mad24(cascadeinfo.y,step, cascadeinfo.x)] - sqsum[mad24(cascadeinfo.y, step, cascadeinfo.z)] -
sqsum[mad24(cascadeinfo.w, step, cascadeinfo.x)] + sqsum[mad24(cascadeinfo.w, step, cascadeinfo.z)];
variance_norm_factor = variance_norm_factor * correction_t - mean * mean;
variance_norm_factor = variance_norm_factor >=0.f ? sqrt(variance_norm_factor) : 1.f;
result = 1;
nodecounter = startnode+nodecount*scalei;
for(int stageloop = start_stage; stageloop < split_stage&&result; stageloop++ )
{
float stage_sum = 0.f;
int4 stageinfo = *(global int4*)(stagecascadeptr+stageloop);
float stagethreshold = as_float(stageinfo.y);
for(int nodeloop = 0; nodeloop < stageinfo.x; nodeloop++ )
{
__global GpuHidHaarTreeNode* currentnodeptr = (nodeptr + nodecounter);
int4 info1 = *(__global int4*)(&(currentnodeptr->p[0][0]));
int4 info2 = *(__global int4*)(&(currentnodeptr->p[1][0]));
int4 info3 = *(__global int4*)(&(currentnodeptr->p[2][0]));
float4 w = *(__global float4*)(&(currentnodeptr->weight[0]));
float2 alpha2 = *(__global float2*)(&(currentnodeptr->alpha[0]));
float nodethreshold = w.w * variance_norm_factor;
info1.x +=p_offset;
info1.z +=p_offset;
info2.x +=p_offset;
info2.z +=p_offset;
float classsum = (sum[mad24(info1.y,step,info1.x)] - sum[mad24(info1.y,step,info1.z)] -
sum[mad24(info1.w,step,info1.x)] + sum[mad24(info1.w,step,info1.z)]) * w.x;
classsum += (sum[mad24(info2.y,step,info2.x)] - sum[mad24(info2.y,step,info2.z)] -
sum[mad24(info2.w,step,info2.x)] + sum[mad24(info2.w,step,info2.z)]) * w.y;
info3.x +=p_offset;
info3.z +=p_offset;
classsum += (sum[mad24(info3.y,step,info3.x)] - sum[mad24(info3.y,step,info3.z)] -
sum[mad24(info3.w,step,info3.x)] + sum[mad24(info3.w,step,info3.z)]) * w.z;
stage_sum += classsum >= nodethreshold ? alpha2.y : alpha2.x;
nodecounter++;
}
result=(stage_sum>=stagethreshold);
}
if(result&&(ix<width)&&(iy<height))
{
int queueindex=atomic_inc(lclcount);
lcloutindex[queueindex<<1]=(y<<16)|x;
lcloutindex[(queueindex<<1)+1]=as_int(variance_norm_factor);
}
barrier(CLK_LOCAL_MEM_FENCE);
int queuecount=lclcount[0];
nodecounter=splitnode+nodecount*scalei;
for(int stageloop=split_stage;stageloop<end_stage&&queuecount>0;stageloop++)
{
lclcount[0]=0;
barrier(CLK_LOCAL_MEM_FENCE);
int2 stageinfo=*(global int2*)(stagecascadeptr+stageloop);
float stagethreshold=as_float(stageinfo.y);
int perfscale=queuecount>4?3:2;
int queuecount_loop=(queuecount+(1<<perfscale)-1)>>perfscale;
int lcl_compute_win=lcl_sz>>perfscale;
int lcl_compute_win_id=(lcl_id>>(6-perfscale));
int lcl_loops=(stageinfo.x+lcl_compute_win-1)>>(6-perfscale);
int lcl_compute_id=lcl_id-(lcl_compute_win_id<<(6-perfscale));
for(int queueloop=0;queueloop<queuecount_loop&&lcl_compute_win_id<queuecount;queueloop++)
{
float stage_sum=0.f;
int temp_coord=lcloutindex[lcl_compute_win_id<<1];
float variance_norm_factor=as_float(lcloutindex[(lcl_compute_win_id<<1)+1]);
int queue_offset=mad24(((temp_coord&(int)0xffff0000)>>16),step,temp_coord&0xffff);
int tempnodecounter=lcl_compute_id;
float part_sum=0.f;
for(int lcl_loop=0;lcl_loop<lcl_loops&&tempnodecounter<stageinfo.x;lcl_loop++)
{
__global GpuHidHaarTreeNode* currentnodeptr = (nodeptr + nodecounter + tempnodecounter);
int4 info1 = *(__global int4*)(&(currentnodeptr->p[0][0]));
int4 info2 = *(__global int4*)(&(currentnodeptr->p[1][0]));
int4 info3 = *(__global int4*)(&(currentnodeptr->p[2][0]));
float4 w = *(__global float4*)(&(currentnodeptr->weight[0]));
float2 alpha2 = *(__global float2*)(&(currentnodeptr->alpha[0]));
float nodethreshold = w.w * variance_norm_factor;
info1.x +=queue_offset;
info1.z +=queue_offset;
info2.x +=queue_offset;
info2.z +=queue_offset;
float classsum = (sum[mad24(info1.y,step,info1.x)] - sum[mad24(info1.y,step,info1.z)] -
sum[mad24(info1.w,step,info1.x)] + sum[mad24(info1.w,step,info1.z)]) * w.x;
classsum += (sum[mad24(info2.y,step,info2.x)] - sum[mad24(info2.y,step,info2.z)] -
sum[mad24(info2.w,step,info2.x)] + sum[mad24(info2.w,step,info2.z)]) * w.y;
info3.x +=queue_offset;
info3.z +=queue_offset;
classsum += (sum[mad24(info3.y,step,info3.x)] - sum[mad24(info3.y,step,info3.z)] -
sum[mad24(info3.w,step,info3.x)] + sum[mad24(info3.w,step,info3.z)]) * w.z;
part_sum += classsum >= nodethreshold ? alpha2.y : alpha2.x;
tempnodecounter+=lcl_compute_win;
}
partialsum[lcl_id]=part_sum;
barrier(CLK_LOCAL_MEM_FENCE);
for(int i=0;i<lcl_compute_win&&(lcl_compute_id==0);i++)
{
stage_sum+=partialsum[lcl_id+i];
}
if(stage_sum>=stagethreshold&&(lcl_compute_id==0))
{
int queueindex=atomic_inc(lclcount);
lcloutindex[queueindex<<1]=temp_coord;
lcloutindex[(queueindex<<1)+1]=as_int(variance_norm_factor);
}
lcl_compute_win_id+=(1<<perfscale);
barrier(CLK_LOCAL_MEM_FENCE);
}
queuecount=lclcount[0];
nodecounter+=stageinfo.x;
}
if(lcl_id<queuecount)
{
int temp=lcloutindex[lcl_id<<1];
int x=temp&0xffff;
int y=(temp&(int)0xffff0000)>>16;
temp=glboutindex[0];
int4 candidate_result;
candidate_result.zw=(int2)convert_int_rtn(factor*20.f);
candidate_result.x=x;
candidate_result.y=y;
atomic_inc(glboutindex);
candidate[outputoff+temp+lcl_id]=candidate_result;
}
barrier(CLK_LOCAL_MEM_FENCE);
}
}
}
}
__kernel void gpuscaleclassifier(global GpuHidHaarTreeNode * orinode, global GpuHidHaarTreeNode * newnode,float scale,float weight_scale,int nodenum)
{
int counter=get_global_id(0);
int tr_x[3],tr_y[3],tr_h[3],tr_w[3],i=0;
GpuHidHaarTreeNode t1 = *(orinode + counter);
#pragma unroll
for(i=0;i<3;i++){
tr_x[i]=(int)(t1.p[i][0]*scale+0.5f);
tr_y[i]=(int)(t1.p[i][1]*scale+0.5f);
tr_w[i]=(int)(t1.p[i][2]*scale+0.5f);
tr_h[i]=(int)(t1.p[i][3]*scale+0.5f);
}
t1.weight[0]=t1.p[2][0]?-(t1.weight[1]*tr_h[1]*tr_w[1]+t1.weight[2]*tr_h[2]*tr_w[2])/(tr_h[0]*tr_w[0]):-t1.weight[1]*tr_h[1]*tr_w[1]/(tr_h[0]*tr_w[0]);
counter+=nodenum;
#pragma unroll
for(i=0;i<3;i++)
{
newnode[counter].p[i][0]=tr_x[i];
newnode[counter].p[i][1]=tr_y[i];
newnode[counter].p[i][2]=tr_x[i]+tr_w[i];
newnode[counter].p[i][3]=tr_y[i]+tr_h[i];
newnode[counter].weight[i]=t1.weight[i]*weight_scale;
}
newnode[counter].left=t1.left;
newnode[counter].right=t1.right;
newnode[counter].threshold=t1.threshold;
newnode[counter].alpha[0]=t1.alpha[0];
newnode[counter].alpha[1]=t1.alpha[1];
}
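The weight[0] recomputation above keeps each scaled feature zero-mean: the (negative) weight of the enclosing rect is chosen so that the weighted areas of all rects cancel after integer rounding of the rect geometry. A quick check of that invariant (illustrative helper, not part of this commit):

/* Invariant maintained by gpuscaleclassifier: the weighted rect areas
 * of a scaled feature sum to approximately zero (float rounding aside). */
static float weighted_area_sum(const int w[3], const int h[3], const float wt[3])
{
    float s = 0.f;
    for (int i = 0; i < 3; ++i)
        s += wt[i] * (float)(w[i] * h[i]);   /* third rect may be all-zero */
    return s;                                /* expect ~0 for a valid node */
}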

File diff suppressed because it is too large

View File

@ -0,0 +1,178 @@
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Rock Li, Rock.li@amd.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//
//#pragma OPENCL EXTENSION cl_amd_printf :enable
__kernel
void bilateral4(__global uchar4 *dst,
__global uchar4 *src,
int rows,
int cols,
int channels,
int radius,
int wholerows,
int wholecols,
int src_step,
int dst_step,
int src_offset,
int dst_offset,
__constant float *sigClr,
__constant float *sigSpc)
{
uint lidx = get_local_id(0);
uint lidy = get_local_id(1);
uint gdx = get_global_id(0);
uint gdy = get_global_id(1);
uint gidx = gdx >=cols?cols-1:gdx;
uint gidy = gdy >=rows?rows-1:gdy;
uchar4 p,q,tmp;
float4 pf = 0,pq = 0,pd = 0;
float wt =0;
int r = radius;
int ij = 0;
int ct = 0;
uint index_src = src_offset/4 + gidy*src_step/4 + gidx;
uint index_dst = dst_offset/4 + gidy*dst_step/4 + gidx;
p = src[index_src];
uint gx,gy;
uint src_index,dst_index;
for(int ii = -r;ii<r+1;ii++)
{
for(int jj =-r;jj<r+1;jj++)
{
ij = ii*ii+jj*jj;
if(ij > mul24(radius,radius)) continue;
gx = gidx + jj;
gy = gidy + ii;
src_index = src_offset/4 + gy * src_step/4 + gx;
q = src[src_index];
ct = abs(p.x-q.x)+abs(p.y-q.y)+abs(p.z-q.z);
wt =sigClr[ct]*sigSpc[(ii+radius)*(2*radius+1)+jj+radius];
pf.x += q.x*wt;
pf.y += q.y*wt;
pf.z += q.z*wt;
// pf.w += q.w*wt;
pq += wt;
}
}
pd = pf/pq;
dst[index_dst] = convert_uchar4_rte(pd);
}
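The `sigClr`/`sigSpc` tables are filled on the host, presumably with Gaussian weights indexed by summed absolute channel difference and by spatial offset, mirroring how OpenCV's CPU bilateral filter precomputes its color/space weight tables. The sketch below is an assumption, since the host code is not part of this diff.

#include <math.h>

/* Hedged sketch of the host-side weight tables consumed above.
 * sigClr: one entry per possible |dB|+|dG|+|dR| value (0 .. 3*255).
 * sigSpc: one entry per (dy, dx) offset inside the radius window,
 * indexed exactly as the kernel does: (dy+radius)*(2*radius+1)+dx+radius. */
static void fill_bilateral_tables(float *sigClr, float *sigSpc,
                                  int radius, float sigma_color, float sigma_space)
{
    float gc = -0.5f / (sigma_color * sigma_color);
    float gs = -0.5f / (sigma_space * sigma_space);
    for (int i = 0; i <= 3 * 255; ++i)
        sigClr[i] = expf(gc * (float)(i * i));
    int d = 2 * radius + 1;
    for (int dy = -radius; dy <= radius; ++dy)
        for (int dx = -radius; dx <= radius; ++dx)
            sigSpc[(dy + radius) * d + (dx + radius)] =
                expf(gs * (float)(dy * dy + dx * dx));
}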
__kernel
void bilateral(__global uchar *dst,
__global uchar *src,
int rows,
int cols,
int channels,
int radius,
int wholerows,
int wholecols,
int src_step,
int dst_step,
int src_offset,
int dst_offset,
__constant float *sigClr,
__constant float *sigSpc)
{
uint lidx = get_local_id(0);
uint lidy = get_local_id(1);
uint gdx = get_global_id(0);
uint gdy = get_global_id(1);
uint gidx = gdx >=cols?cols-1:gdx;
uint gidy = gdy >=rows?rows-1:gdy;
uchar p,q,tmp;
float pf = 0,pq = 0,wt = 0,pd = 0;
int r =radius;
int ij = 0;
int ct = 0;
uint index_src = src_offset + gidy*src_step + gidx;
uint index_dst = dst_offset + gidy*dst_step + gidx;
p = src[index_src];
uint gx,gy;
uint src_index,dst_index;
for(int ii = -r;ii<r+1;ii++)
{
for(int jj =-r;jj<r+1;jj++)
{
ij = ii*ii+jj*jj;
if(ij > mul24(radius,radius)) continue;
gx = gidx + jj;
gy = gidy + ii;
src_index = src_offset + gy * src_step + gx;
q = src[src_index];
ct = abs(p-q);
wt =sigClr[ct]*sigSpc[(ii+radius)*(2*radius+1)+jj+radius];
pf += q*wt;
pq += wt;
}
}
pd = pf/pq;
dst[index_dst] = convert_uchar_rte(pd);
}

View File

@ -0,0 +1,199 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Shengen Yan,yanshengen@gmail.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if defined (DOUBLE_SUPPORT)
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#endif
///////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////Macro for border type////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////////////////////////
#ifdef BORDER_REPLICATE
//BORDER_REPLICATE: aaaaaa|abcdefgh|hhhhhhh
#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? (l_edge) : (i))
#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? (r_edge)-1 : (addr))
#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? (t_edge) :(i))
#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? (b_edge)-1 :(addr))
#endif
#ifdef BORDER_REFLECT
//BORDER_REFLECT: fedcba|abcdefgh|hgfedcb
#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? -(i)-1 : (i))
#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? -(i)-1+((r_edge)<<1) : (addr))
#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? -(i)-1 : (i))
#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? -(i)-1+((b_edge)<<1) : (addr))
#endif
#ifdef BORDER_REFLECT_101
//BORDER_REFLECT_101: gfedcb|abcdefgh|gfedcba
#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? -(i) : (i))
#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? -(i)-2+((r_edge)<<1) : (addr))
#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? -(i) : (i))
#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? -(i)-2+((b_edge)<<1) : (addr))
#endif
#ifdef BORDER_WRAP
//BORDER_WRAP: cdefgh|abcdefgh|abcdefg
#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? (i)+(r_edge) : (i))
#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? (i)-(r_edge) : (addr))
#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? (i)+(b_edge) : (i))
#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? (i)-(b_edge) : (addr))
#endif
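// Usage note: the ADDR_* macros are composed pairwise; an index is folded
// against one edge first and the partially folded value is then passed to
// the opposite-edge macro, e.g. for a row index:
//   int row = ADDR_H(y, 0, whole_rows);    // fold across the top border
//   row     = ADDR_B(y, whole_rows, row);  // fold across the bottom border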
#define THREADS 256
#define ELEM(i, l_edge, r_edge, elem1, elem2) ((i) >= (l_edge) && (i) < (r_edge) ? (elem1) : (elem2))
///////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////calcHarris////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////
__kernel void calcHarris(__global const float *Dx,__global const float *Dy, __global float *dst,
int dx_offset, int dx_whole_rows, int dx_whole_cols, int dx_step,
int dy_offset, int dy_whole_rows, int dy_whole_cols, int dy_step,
int dst_offset, int dst_rows, int dst_cols, int dst_step,
float k)
{
int col = get_local_id(0);
const int gX = get_group_id(0);
const int gY = get_group_id(1);
int dx_x_off = (dx_offset % dx_step) >> 2;
int dx_y_off = dx_offset / dx_step;
int dy_x_off = (dy_offset % dy_step) >> 2;
int dy_y_off = dy_offset / dy_step;
int dst_x_off = (dst_offset % dst_step) >> 2;
int dst_y_off = dst_offset / dst_step;
int dx_startX = gX * (THREADS-ksX+1) - anX + dx_x_off;
int dx_startY = (gY << 1) - anY + dx_y_off;
int dy_startX = gX * (THREADS-ksX+1) - anX + dy_x_off;
int dy_startY = (gY << 1) - anY + dy_y_off;
int dst_startX = gX * (THREADS-ksX+1) + dst_x_off;
int dst_startY = (gY << 1) + dst_y_off;
float dx_data[ksY+1],dy_data[ksY+1],data[3][ksY+1];
__local float temp[6][THREADS];
#ifdef BORDER_CONSTANT
bool dx_con,dy_con;
float dx_s,dy_s;
for(int i=0; i < ksY+1; i++)
{
dx_con = dx_startX+col >= 0 && dx_startX+col < dx_whole_cols && dx_startY+i >= 0 && dx_startY+i < dx_whole_rows;
dx_s = Dx[(dx_startY+i)*(dx_step>>2)+(dx_startX+col)];
dx_data[i] = dx_con ? dx_s : 0.0;
dy_con = dy_startX+col >= 0 && dy_startX+col < dy_whole_cols && dy_startY+i >= 0 && dy_startY+i < dy_whole_rows;
dy_s = Dy[(dy_startY+i)*(dy_step>>2)+(dy_startX+col)];
dy_data[i] = dy_con ? dy_s : 0.0;
data[0][i] = dx_data[i] * dx_data[i];
data[1][i] = dx_data[i] * dy_data[i];
data[2][i] = dy_data[i] * dy_data[i];
}
#else
for(int i=0; i < ksY+1; i++)
{
int dx_selected_row;
int dx_selected_col;
dx_selected_row = ADDR_H(dx_startY+i, 0, dx_whole_rows);
dx_selected_row = ADDR_B(dx_startY+i, dx_whole_rows, dx_selected_row);
dx_selected_col = ADDR_L(dx_startX+col, 0, dx_whole_cols);
dx_selected_col = ADDR_R(dx_startX+col, dx_whole_cols, dx_selected_col);
dx_data[i] = Dx[dx_selected_row * (dx_step>>2) + dx_selected_col];
int dy_selected_row;
int dy_selected_col;
dy_selected_row = ADDR_H(dy_startY+i, 0, dy_whole_rows);
dy_selected_row = ADDR_B(dy_startY+i, dy_whole_rows, dy_selected_row);
dy_selected_col = ADDR_L(dy_startX+col, 0, dy_whole_cols);
dy_selected_col = ADDR_R(dy_startX+col, dy_whole_cols, dy_selected_col);
dy_data[i] = Dy[dy_selected_row * (dy_step>>2) + dy_selected_col];
data[0][i] = dx_data[i] * dx_data[i];
data[1][i] = dx_data[i] * dy_data[i];
data[2][i] = dy_data[i] * dy_data[i];
}
#endif
float sum0 = 0.0, sum1 = 0.0, sum2 = 0.0;
for(int i=1; i < ksY; i++)
{
sum0 += (data[0][i]);
sum1 += (data[1][i]);
sum2 += (data[2][i]);
}
float sum01,sum02,sum11,sum12,sum21,sum22;
sum01 = sum0 + (data[0][0]);
sum02 = sum0 + (data[0][ksY]);
temp[0][col] = sum01;
temp[1][col] = sum02;
sum11 = sum1 + (data[1][0]);
sum12 = sum1 + (data[1][ksY]);
temp[2][col] = sum11;
temp[3][col] = sum12;
sum21 = sum2 + (data[2][0]);
sum22 = sum2 + (data[2][ksY]);
temp[4][col] = sum21;
temp[5][col] = sum22;
barrier(CLK_LOCAL_MEM_FENCE);
if(col < (THREADS-(ksX-1)))
{
col += anX;
int posX = dst_startX - dst_x_off + col - anX;
int posY = (gY << 1);
int till = (ksX + 1)%2;
float tmp_sum[6]={ 0.0, 0.0 , 0.0, 0.0, 0.0, 0.0 };
for(int k=0; k<6; k++)
for(int i=-anX; i<=anX - till; i++)
{
tmp_sum[k] += temp[k][col+i];
}
if(posX < dst_cols && (posY) < dst_rows)
{
dst[(dst_startY+0) * (dst_step>>2)+ dst_startX + col - anX] =
tmp_sum[0] * tmp_sum[4] - tmp_sum[2] * tmp_sum[2] - k * (tmp_sum[0] + tmp_sum[4]) * (tmp_sum[0] + tmp_sum[4]);
}
if(posX < dst_cols && (posY + 1) < dst_rows)
{
dst[(dst_startY+1) * (dst_step>>2)+ dst_startX + col - anX] =
tmp_sum[1] * tmp_sum[5] - tmp_sum[3] * tmp_sum[3] - k * (tmp_sum[1] + tmp_sum[5]) * (tmp_sum[1] + tmp_sum[5]);
}
}
}
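// The two stores above evaluate the Harris response R = det(M) - k*trace(M)^2
// of the 2x2 structure matrix M = [Sxx Sxy; Sxy Syy]: tmp_sum[0], tmp_sum[2]
// and tmp_sum[4] hold the box-filtered sums of Dx*Dx, Dx*Dy and Dy*Dy for the
// even output row, and tmp_sum[1], tmp_sum[3], tmp_sum[5] for the odd row.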

View File

@ -0,0 +1,203 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Shengen Yan,yanshengen@gmail.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other GpuMaterials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if defined (DOUBLE_SUPPORT)
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#endif
///////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////Macro for border type////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////////////////////////
#ifdef BORDER_REPLICATE
//BORDER_REPLICATE: aaaaaa|abcdefgh|hhhhhhh
#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? (l_edge) : (i))
#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? (r_edge)-1 : (addr))
#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? (t_edge) :(i))
#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? (b_edge)-1 :(addr))
#endif
#ifdef BORDER_REFLECT
//BORDER_REFLECT: fedcba|abcdefgh|hgfedcb
#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? -(i)-1 : (i))
#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? -(i)-1+((r_edge)<<1) : (addr))
#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? -(i)-1 : (i))
#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? -(i)-1+((b_edge)<<1) : (addr))
#endif
#ifdef BORDER_REFLECT_101
//BORDER_REFLECT_101: gfedcb|abcdefgh|gfedcba
#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? -(i) : (i))
#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? -(i)-2+((r_edge)<<1) : (addr))
#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? -(i) : (i))
#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? -(i)-2+((b_edge)<<1) : (addr))
#endif
#ifdef BORDER_WRAP
//BORDER_WRAP: cdefgh|abcdefgh|abcdefg
#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? (i)+(r_edge) : (i))
#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? (i)-(r_edge) : (addr))
#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? (i)+(b_edge) : (i))
#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? (i)-(b_edge) : (addr))
#endif
#define THREADS 256
#define ELEM(i, l_edge, r_edge, elem1, elem2) ((i) >= (l_edge) && (i) < (r_edge) ? (elem1) : (elem2))
///////////////////////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////calcMinEigenVal//////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////
__kernel void calcMinEigenVal(__global const float *Dx,__global const float *Dy, __global float *dst,
int dx_offset, int dx_whole_rows, int dx_whole_cols, int dx_step,
int dy_offset, int dy_whole_rows, int dy_whole_cols, int dy_step,
int dst_offset, int dst_rows, int dst_cols, int dst_step,
float k)
{
int col = get_local_id(0);
const int gX = get_group_id(0);
const int gY = get_group_id(1);
int dx_x_off = (dx_offset % dx_step) >> 2;
int dx_y_off = dx_offset / dx_step;
int dy_x_off = (dy_offset % dy_step) >> 2;
int dy_y_off = dy_offset / dy_step;
int dst_x_off = (dst_offset % dst_step) >> 2;
int dst_y_off = dst_offset / dst_step;
int dx_startX = gX * (THREADS-ksX+1) - anX + dx_x_off;
int dx_startY = (gY << 1) - anY + dx_y_off;
int dy_startX = gX * (THREADS-ksX+1) - anX + dy_x_off;
int dy_startY = (gY << 1) - anY + dy_y_off;
int dst_startX = gX * (THREADS-ksX+1) + dst_x_off;
int dst_startY = (gY << 1) + dst_y_off;
float dx_data[ksY+1],dy_data[ksY+1],data[3][ksY+1];
__local float temp[6][THREADS];
#ifdef BORDER_CONSTANT
bool dx_con,dy_con;
float dx_s,dy_s;
for(int i=0; i < ksY+1; i++)
{
dx_con = dx_startX+col >= 0 && dx_startX+col < dx_whole_cols && dx_startY+i >= 0 && dx_startY+i < dx_whole_rows;
dx_s = Dx[(dx_startY+i)*(dx_step>>2)+(dx_startX+col)];
dx_data[i] = dx_con ? dx_s : 0.0;
dy_con = dy_startX+col >= 0 && dy_startX+col < dy_whole_cols && dy_startY+i >= 0 && dy_startY+i < dy_whole_rows;
dy_s = Dy[(dy_startY+i)*(dy_step>>2)+(dy_startX+col)];
dy_data[i] = dy_con ? dy_s : 0.0;
data[0][i] = dx_data[i] * dx_data[i];
data[1][i] = dx_data[i] * dy_data[i];
data[2][i] = dy_data[i] * dy_data[i];
}
#else
for(int i=0; i < ksY+1; i++)
{
int dx_selected_row;
int dx_selected_col;
dx_selected_row = ADDR_H(dx_startY+i, 0, dx_whole_rows);
dx_selected_row = ADDR_B(dx_startY+i, dx_whole_rows, dx_selected_row);
dx_selected_col = ADDR_L(dx_startX+col, 0, dx_whole_cols);
dx_selected_col = ADDR_R(dx_startX+col, dx_whole_cols, dx_selected_col);
dx_data[i] = Dx[dx_selected_row * (dx_step>>2) + dx_selected_col];
int dy_selected_row;
int dy_selected_col;
dy_selected_row = ADDR_H(dy_startY+i, 0, dy_whole_rows);
dy_selected_row = ADDR_B(dy_startY+i, dy_whole_rows, dy_selected_row);
dy_selected_col = ADDR_L(dy_startX+col, 0, dy_whole_cols);
dy_selected_col = ADDR_R(dy_startX+col, dy_whole_cols, dy_selected_col);
dy_data[i] = Dy[dy_selected_row * (dy_step>>2) + dy_selected_col];
data[0][i] = dx_data[i] * dx_data[i];
data[1][i] = dx_data[i] * dy_data[i];
data[2][i] = dy_data[i] * dy_data[i];
}
#endif
float sum0 = 0.0, sum1 = 0.0, sum2 = 0.0;
for(int i=1; i < ksY; i++)
{
sum0 += (data[0][i]);
sum1 += (data[1][i]);
sum2 += (data[2][i]);
}
float sum01,sum02,sum11,sum12,sum21,sum22;
sum01 = sum0 + (data[0][0]);
sum02 = sum0 + (data[0][ksY]);
temp[0][col] = sum01;
temp[1][col] = sum02;
sum11 = sum1 + (data[1][0]);
sum12 = sum1 + (data[1][ksY]);
temp[2][col] = sum11;
temp[3][col] = sum12;
sum21 = sum2 + (data[2][0]);
sum22 = sum2 + (data[2][ksY]);
temp[4][col] = sum21;
temp[5][col] = sum22;
barrier(CLK_LOCAL_MEM_FENCE);
if(col < (THREADS-(ksX-1)))
{
col += anX;
int posX = dst_startX - dst_x_off + col - anX;
int posY = (gY << 1);
int till = (ksX + 1)%2;
float tmp_sum[6]={ 0.0, 0.0 , 0.0, 0.0, 0.0, 0.0 };
for(int k=0; k<6; k++)
for(int i=-anX; i<=anX - till; i++)
{
tmp_sum[k] += temp[k][col+i];
}
if(posX < dst_cols && (posY) < dst_rows)
{
float a = tmp_sum[0] * 0.5f;
float b = tmp_sum[2];
float c = tmp_sum[4] * 0.5f;
dst[(dst_startY+0) * (dst_step>>2)+ dst_startX + col - anX] = (float)((a+c) - sqrt((a-c)*(a-c) + b*b));
}
if(posX < dst_cols && (posY + 1) < dst_rows)
{
float a = tmp_sum[1] * 0.5f;
float b = tmp_sum[3];
float c = tmp_sum[5] * 0.5f;
dst[(dst_startY+1) * (dst_step>>2)+ dst_startX + col - anX] = (float)((a+c) - sqrt((a-c)*(a-c) + b*b));
}
}
}
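// Here (a+c) - sqrt((a-c)*(a-c) + b*b), with a = 0.5*Sxx, b = Sxy and
// c = 0.5*Syy, is the closed-form smaller eigenvalue of the structure matrix
// [Sxx Sxy; Sxy Syy], i.e. the Shi-Tomasi corner measure; the k argument is
// kept only for signature compatibility with calcHarris and is unused.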

View File

@ -0,0 +1,246 @@
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Niko Li, newlife20080214@gmail.com
// Zero Lin zero.lin@amd.com
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other oclMaterials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//
#define get(a,b,c) (( b >= top & b < srcRows+top & a >= left & a < srcCols+left )? c : 8)
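// get() yields the lane index of the fetched vector when pixel (a,b) lies
// inside the source rectangle, and the sentinel index 8 otherwise; sv[8] in
// the kernel below is preloaded with the constant border value nVal.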
__kernel void copyConstBorder_C1_D0(__global uchar * src, __global uchar * dst, int srcOffset, int dstOffset,
int srcCols, int srcRows, int dstCols, int dstRows,
int top, int left, uchar nVal, int srcStep, int dstStep)
{
int idx = get_global_id(0);
int tpr = (dstCols + 3 + (dstOffset&3))>>2;
int dx = ((idx%(tpr))<<2) - (dstOffset&3);
int dy = idx/(tpr);
__global uchar4 * d=(__global uchar4 *)(dst + dstOffset + dy*dstStep + dx);
int start=srcOffset + (dy-top)*srcStep + (dx-left);
uchar8 s=*((__global uchar8 *)(src + ((start>>2)<<2) ));
uchar4 v;
uchar sv[9]={s.s0,s.s1,s.s2,s.s3,s.s4,s.s5,s.s6,s.s7,nVal};
int det=start&3;
v.x=sv[get(dx,dy,det)];
v.y=sv[get(dx+1,dy,det+1)];
v.z=sv[get(dx+2,dy,det+2)];
v.w=sv[get(dx+3,dy,det+3)];
if(dy<dstRows)
{
uchar4 res = *d;
res.x = (dx>=0 && dx<dstCols) ? v.x : res.x;
res.y = (dx+1>=0 && dx+1<dstCols) ? v.y : res.y;
res.z = (dx+2>=0 && dx+2<dstCols) ? v.z : res.z;
res.w = (dx+3>=0 && dx+3<dstCols) ? v.w : res.w;
*d=res;
}
}
#undef get
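// Alignment note for the uchar kernel above: the source address is rounded
// down to a 4-byte boundary ((start>>2)<<2) so the uchar8 load stays aligned,
// and det = start&3 re-adds the dropped misalignment when indexing sv[]; the
// other *_C1_D0 kernels below reuse the same trick.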
#define get(a,b,c,d) (( b >= top & b < srcRows+top & a >= left & a < srcCols+left )? c : d)
__kernel void copyConstBorder_C1_D4(__global int * src, __global int * dst, int srcOffset, int dstOffset,
int srcCols, int srcRows, int dstCols, int dstRows,
int top, int left, int nVal, int srcStep, int dstStep)
{
int idx = get_global_id(0);
int tpr = (dstCols + 3)>>2;
int dx = (idx%(tpr))<<2;
int dy = idx/(tpr);
__global int4 * d=(__global int4 *)(dst + dstOffset + dy*dstStep + dx);
int4 s=*((__global int4 *)(src + srcOffset + (dy-top)*srcStep + (dx-left) ));
int4 v;
v.x=get(dx,dy,s.x,nVal);
v.y=get(dx+1,dy,s.y,nVal);
v.z=get(dx+2,dy,s.z,nVal);
v.w=get(dx+3,dy,s.w,nVal);
if(dy<dstRows)
{
int4 res = *d;
v.y = (dx+1<dstCols) ? v.y : res.y;
v.z = (dx+2<dstCols) ? v.z : res.z;
v.w = (dx+3<dstCols) ? v.w : res.w;
*d=v;
}
}
#undef get
#define get(a,b,c) ( a < srcCols+left ? b : c)
__kernel void copyReplicateBorder_C1_D4(__global int * src, __global int * dst, int srcOffset, int dstOffset,
int srcCols, int srcRows, int dstCols, int dstRows,
int top, int left, int nVal, int srcStep, int dstStep)
{
int idx = get_global_id(0);
int tpr = (dstCols + 3)>>2;
int dx = (idx%(tpr))<<2;
int dy = idx/(tpr);
__global int4 * d=(__global int4 *)(dst + dstOffset + dy*dstStep + dx);
int c=clamp(dx-left,0,srcCols-1);
int4 s=*((__global int4 *)(src + srcOffset + clamp(dy-top,0,srcRows-1) * srcStep + c ));
int sa[4]={s.x,s.y,s.z,s.w};
int4 v;
v.x=get(dx,sa[max(0,(dx-left)-c)],sa[srcCols-1-c]);
v.y=get(dx+1,sa[max(0,(dx+1-left)-c)],sa[srcCols-1-c]);
v.z=get(dx+2,sa[max(0,(dx+2-left)-c)],sa[srcCols-1-c]);
v.w=get(dx+3,sa[max(0,(dx+3-left)-c)],sa[srcCols-1-c]);
if(dy<dstRows)
{
int4 res = *d;
v.y = (dx+1<dstCols) ? v.y : res.y;
v.z = (dx+2<dstCols) ? v.z : res.z;
v.w = (dx+3<dstCols) ? v.w : res.w;
*d=v;
}
}
__kernel void copyReplicateBorder_C1_D0(__global uchar * src, __global uchar * dst, int srcOffset, int dstOffset,
int srcCols, int srcRows, int dstCols, int dstRows,
int top, int left, uchar nVal, int srcStep, int dstStep)
{
int idx = get_global_id(0);
int tpr = (dstCols + 3 + (dstOffset&3))>>2;
int dx = ((idx%(tpr))<<2) - (dstOffset&3);
int dy = idx/(tpr);
__global uchar4 * d=(__global uchar4 *)(dst + dstOffset + dy*dstStep + dx);
int c=clamp(dx-left,0,srcCols-1);
int start= srcOffset + clamp(dy-top,0,srcRows-1) * srcStep + c;
uchar8 s=*((__global uchar8 *)(src + ((start>>2)<<2) ));
uchar4 v;
uchar sa[8]={s.s0,s.s1,s.s2,s.s3,s.s4,s.s5,s.s6,s.s7};
int det=start&3;
v.x=get(dx,sa[max(0,(dx-left)-c)+det],sa[srcCols-1-c+det]);
v.y=get(dx+1,sa[max(0,(dx+1-left)-c)+det],sa[srcCols-1-c+det]);
v.z=get(dx+2,sa[max(0,(dx+2-left)-c)+det],sa[srcCols-1-c+det]);
v.w=get(dx+3,sa[max(0,(dx+3-left)-c)+det],sa[srcCols-1-c+det]);
if(dy<dstRows)
{
uchar4 res = *d;
res.x = (dx>=0 && dx<dstCols) ? v.x : res.x;
res.y = (dx+1>=0 && dx+1<dstCols) ? v.y : res.y;
res.z = (dx+2>=0 && dx+2<dstCols) ? v.z : res.z;
res.w = (dx+3>=0 && dx+3<dstCols) ? v.w : res.w;
*d=res;
}
}
#undef get
//BORDER_REFLECT_101: gfedcb|abcdefgh|gfedcba
#define edge(x,size,rx) rx = abs(x) % ((size<<1)-2); rx = (rx>=size?(size<<1)-2:rx<<1) - rx;
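// edge(x,size,rx) folds x into [0, size-1] with period 2*size-2 (reflect-101
// semantics); e.g. for size = 5 it maps -1 -> 1, 5 -> 3 and 6 -> 2.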
__kernel void copyReflectBorder_C1_D4(__global int * src, __global int * dst, int srcOffset, int dstOffset,
int srcCols, int srcRows, int dstCols, int dstRows,
int top, int left, int nVal, int srcStep, int dstStep)
{
int idx = get_global_id(0);
int tpr = (dstCols + 3)>>2;
int dx = (idx%(tpr))<<2;
int dy = idx/(tpr);
__global int4 * d=(__global int4 *)(dst + dstOffset + dy*dstStep + dx);
uint4 id;
edge(dx-left,srcCols,id.x);
edge(dx-left+1,srcCols,id.y);
edge(dx-left+2,srcCols,id.z);
edge(dx-left+3,srcCols,id.w);
int start=min(id.x,id.w);
int4 s=*((__global int4 *)(src + srcOffset + clamp(dy-top,0,srcRows-1) * srcStep + start));
int sa[4]={s.x,s.y,s.z,s.w};
int4 v=(int4)(sa[(id.x-start)],sa[(id.y-start)],sa[(id.z-start)],sa[(id.w-start)]);
if(dy<dstRows)
{
int4 res = *d;
v.y = (dx+1<dstCols) ? v.y : res.y;
v.z = (dx+2<dstCols) ? v.z : res.z;
v.w = (dx+3<dstCols) ? v.w : res.w;
*d=v;
}
}
__kernel void copyReflectBorder_C1_D0(__global uchar * src, __global uchar * dst, int srcOffset, int dstOffset,
int srcCols, int srcRows, int dstCols, int dstRows,
int top, int left, uchar nVal, int srcStep, int dstStep)
{
int idx = get_global_id(0);
int tpr = (dstCols + 3 + (dstOffset&3))>>2;
int dx = ((idx%(tpr))<<2) - (dstOffset&3);
int dy = idx/(tpr);
__global uchar4 * d=(__global uchar4 *)(dst + dstOffset + dy*dstStep + dx);
uint4 id;
edge(dx-left,srcCols,id.x);
edge(dx-left+1,srcCols,id.y);
edge(dx-left+2,srcCols,id.z);
edge(dx-left+3,srcCols,id.w);
int start=min(id.x,id.w) + srcOffset;
uchar8 s=*((__global uchar8 *)(src + clamp(dy-top,0,srcRows-1) * srcStep + ((start>>2)<<2) ));
uchar sa[8]={s.s0,s.s1,s.s2,s.s3,s.s4,s.s5,s.s6,s.s7};
int det=start&3;
uchar4 v=(uchar4)(sa[(id.x-start)+det],sa[(id.y-start)+det],sa[(id.z-start)+det],sa[(id.w-start)+det]);
if(dy<dstRows)
{
uchar4 res = *d;
res.x = (dx>=0 && dx<dstCols) ? v.x : res.x;
res.y = (dx+1>=0 && dx+1<dstCols) ? v.y : res.y;
res.z = (dx+2>=0 && dx+2<dstCols) ? v.z : res.z;
res.w = (dx+3>=0 && dx+3<dstCols) ? v.w : res.w;
*d=res;
}
}
#undef edge

View File

@ -0,0 +1,222 @@
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Niko Li, newlife20080214@gmail.com
// Jia Haipeng, jiahaipeng95@gmail.com
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other GpuMaterials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//
#define PARTITAL_HISTGRAM256_COUNT (256)
#define HISTOGRAM256_BIN_COUNT (256)
#define HISTGRAM256_WORK_GROUP_SIZE (256)
#define HISTGRAM256_LOCAL_MEM_SIZE (HISTOGRAM256_BIN_COUNT)
__kernel __attribute__((reqd_work_group_size(256,1,1)))void calc_sub_hist_D0(__global const uchar4* src,
int src_step,
int src_offset,
__global int* buf,
int data_count,
int cols,
int inc_x,
int inc_y,
int dst_offset)
{
int x = get_global_id(0);
int lx = get_local_id(0);
int gx = get_group_id(0);
int total_threads = get_global_size(0);
src += src_offset;
__local int s_hist[HISTGRAM256_LOCAL_MEM_SIZE];
s_hist[lx] = 0;
int pos_y = x / cols;
int pos_x = x - mul24(pos_y, cols);
barrier(CLK_LOCAL_MEM_FENCE);
for(int pos = x; pos < data_count; pos += total_threads)
{
int4 data = convert_int4(src[mad24(pos_y,src_step,pos_x)]);
atomic_inc(s_hist + data.x);
atomic_inc(s_hist + data.y);
atomic_inc(s_hist + data.z);
atomic_inc(s_hist + data.w);
pos_x +=inc_x;
int off = (pos_x >= cols ? -1 : 0);
pos_x = mad24(off,cols,pos_x);
pos_y += inc_y - off;
//pos_x = pos_x > cols ? pos_x - cols : pos_x;
//pos_y = pos_x > cols ? pos_y + 1 : pos_y;
}
barrier(CLK_LOCAL_MEM_FENCE);
buf[ mad24(gx, dst_offset, lx)] = s_hist[lx];
}
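// Strategy: each work-group builds a private 256-bin histogram in local
// memory with atomic_inc and writes it out as one partial row of buf;
// merge_hist below then reduces the partial rows into the final histogram
// with a work-group tree reduction.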
__kernel void __attribute__((reqd_work_group_size(1,256,1)))calc_sub_hist2_D0( __global const uchar* src,
int src_step,
int src_offset,
__global int* buf,
int left_col,
int cols,
int rows,
int dst_offset)
{
int gidx = get_global_id(0);
int gidy = get_global_id(1);
int gx = get_group_id(0);
int gy = get_group_id(1);
int gnum = get_num_groups(0);
int output_row = mad24(gy,gnum,gx);
//int lidx = get_local_id(0);
int lidy = get_local_id(1);
__local int s_hist[HISTGRAM256_LOCAL_MEM_SIZE+1];
s_hist[lidy] = 0;
//mem_fence(CLK_LOCAL_MEM_FENCE);
//clamp(gidx,mask,cols-1);
gidx = gidx >= left_col ? cols+gidx : gidx;
//gidy = gidy >= rows?rows-1:gidy;
int src_index = src_offset + mad24(gidy,src_step,gidx);
//int dst_index = dst_offset + mad24(gidy,dst_step,gidx);
//uchar4 p,q;
barrier(CLK_LOCAL_MEM_FENCE);
int p = (int)src[src_index];
p = gidy >= rows ? HISTGRAM256_LOCAL_MEM_SIZE : p;
atomic_inc(s_hist + p);
barrier(CLK_LOCAL_MEM_FENCE);
buf[ mad24(output_row, dst_offset, lidy)] += s_hist[lidy];
}
__kernel __attribute__((reqd_work_group_size(256,1,1)))void merge_hist(__global int* buf,
__global int* hist,
int src_step)
{
int lx = get_local_id(0);
int gx = get_group_id(0);
int sum = 0;
for(int i = lx; i < PARTITAL_HISTGRAM256_COUNT; i += HISTGRAM256_WORK_GROUP_SIZE)
sum += buf[ mad24(i, src_step, gx)];
__local int data[HISTGRAM256_WORK_GROUP_SIZE];
data[lx] = sum;
for(int stride = HISTGRAM256_WORK_GROUP_SIZE /2; stride > 0; stride >>= 1)
{
barrier(CLK_LOCAL_MEM_FENCE);
if(lx < stride)
data[lx] += data[lx + stride];
}
if(lx == 0)
hist[gx] = data[0];
}
__kernel __attribute__((reqd_work_group_size(256,1,1)))void calLUT(
__global uchar * dst,
__constant int * hist,
float scale)
{
int lid = get_local_id(0);
__local int sumhist[HISTOGRAM256_BIN_COUNT];
//__local uchar lut[HISTOGRAM256_BIN_COUNT+1];
sumhist[lid]=hist[lid];
barrier(CLK_LOCAL_MEM_FENCE);
if(lid==0)
{
int sum = 0;
for(int i=0;i<HISTOGRAM256_BIN_COUNT;i++)
{
sum+=sumhist[i];
sumhist[i]=sum;
}
}
barrier(CLK_LOCAL_MEM_FENCE);
dst[lid]= lid == 0 ? 0 : convert_uchar_sat(convert_float(sumhist[lid])*scale);
}
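// calLUT converts the merged histogram into an equalization lookup table:
// a single work-item builds the inclusive prefix sum, each bin is scaled by
// 'scale' (presumably 255 divided by the pixel count, set on the host) and
// saturated to uchar, and entry 0 of the table is pinned to zero.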
/*
///////////////////////////////equalizeHist//////////////////////////////////////////////////
__kernel __attribute__((reqd_work_group_size(256,1,1)))void equalizeHist(
__global uchar * src,
__global uchar * dst,
__constant int * hist,
int srcstep,
int srcoffset,
int dststep,
int dstoffset,
int width,
int height,
float scale,
int inc_x,
int inc_y)
{
int gidx = get_global_id(0);
int lid = get_local_id(0);
int glb_size = get_global_size(0);
src+=srcoffset;
dst+=dstoffset;
__local int sumhist[HISTOGRAM256_BIN_COUNT];
__local uchar lut[HISTOGRAM256_BIN_COUNT+1];
sumhist[lid]=hist[lid];
barrier(CLK_LOCAL_MEM_FENCE);
if(lid==0)
{
int sum = 0;
for(int i=0;i<HISTOGRAM256_BIN_COUNT;i++)
{
sum+=sumhist[i];
sumhist[i]=sum;
}
}
barrier(CLK_LOCAL_MEM_FENCE);
lut[lid]= convert_uchar_sat(convert_float(sumhist[lid])*scale);
lut[0]=0;
int pos_y = gidx / width;
int pos_x = gidx - mul24(pos_y, width);
for(int pos = gidx; pos < mul24(width,height); pos += glb_size)
{
int inaddr = mad24(pos_y,srcstep,pos_x);
int outaddr = mad24(pos_y,dststep,pos_x);
dst[outaddr] = lut[src[inaddr]];
pos_x +=inc_x;
int off = (pos_x >= width ? -1 : 0);
pos_x = mad24(off,width,pos_x);
pos_y += inc_y - off;
}
}
*/

View File

@ -0,0 +1,269 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Shengen Yan,yanshengen@gmail.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other GpuMaterials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if defined (DOUBLE_SUPPORT)
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#endif
#define LSIZE 256
#define LSIZE_1 255
#define LSIZE_2 254
#define HF_LSIZE 128
#define LOG_LSIZE 8
#define LOG_NUM_BANKS 5
#define NUM_BANKS 32
#define GET_CONFLICT_OFFSET(lid) ((lid) >> LOG_NUM_BANKS)
kernel void integral_cols(__global uchar4 *src,__global int *sum ,__global float *sqsum,
int src_offset,int pre_invalid,int rows,int cols,int src_step,int dst_step)
{
unsigned int lid = get_local_id(0);
unsigned int gid = get_group_id(0);
int4 src_t[2], sum_t[2];
float4 sqsum_t[2];
__local int4 lm_sum[2][LSIZE + LOG_LSIZE];
__local float4 lm_sqsum[2][LSIZE + LOG_LSIZE];
__local int* sum_p;
__local float* sqsum_p;
src_step = src_step >> 2;
gid = gid << 1;
for(int i = 0; i < rows; i =i + LSIZE_1)
{
src_t[0] = (i + lid < rows ? convert_int4(src[src_offset + (lid+i) * src_step + gid]) : 0);
src_t[1] = (i + lid < rows ? convert_int4(src[src_offset + (lid+i) * src_step + gid + 1]) : 0);
sum_t[0] = (i == 0 ? 0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]);
sqsum_t[0] = (i == 0 ? 0 : lm_sqsum[0][LSIZE_2 + LOG_LSIZE]);
sum_t[1] = (i == 0 ? 0 : lm_sum[1][LSIZE_2 + LOG_LSIZE]);
sqsum_t[1] = (i == 0 ? 0 : lm_sqsum[1][LSIZE_2 + LOG_LSIZE]);
barrier(CLK_LOCAL_MEM_FENCE);
int bf_loc = lid + GET_CONFLICT_OFFSET(lid);
lm_sum[0][bf_loc] = src_t[0];
lm_sqsum[0][bf_loc] = convert_float4(src_t[0] * src_t[0]);
lm_sum[1][bf_loc] = src_t[1];
lm_sqsum[1][bf_loc] = convert_float4(src_t[1] * src_t[1]);
int offset = 1;
for(int d = LSIZE >> 1 ; d > 0; d>>=1)
{
barrier(CLK_LOCAL_MEM_FENCE);
int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
ai += GET_CONFLICT_OFFSET(ai);
bi += GET_CONFLICT_OFFSET(bi);
if((lid & 127) < d)
{
lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai];
lm_sqsum[lid >> 7][bi] += lm_sqsum[lid >> 7][ai];
}
offset <<= 1;
}
if(lid < 2)
{
lm_sum[lid][LSIZE_2 + LOG_LSIZE] = 0;
lm_sqsum[lid][LSIZE_2 + LOG_LSIZE] = 0;
}
for(int d = 1; d < LSIZE; d <<= 1)
{
barrier(CLK_LOCAL_MEM_FENCE);
offset >>= 1;
int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
ai += GET_CONFLICT_OFFSET(ai);
bi += GET_CONFLICT_OFFSET(bi);
if((lid & 127) < d)
{
lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai];
lm_sum[lid >> 7][ai] = lm_sum[lid >> 7][bi] - lm_sum[lid >> 7][ai];
lm_sqsum[lid >> 7][bi] += lm_sqsum[lid >> 7][ai];
lm_sqsum[lid >> 7][ai] = lm_sqsum[lid >> 7][bi] - lm_sqsum[lid >> 7][ai];
}
}
if(lid > 0 && (i+lid) <= rows){
int loc_s0 = gid * dst_step + i + lid - 1 - pre_invalid * dst_step / 4, loc_s1 = loc_s0 + dst_step ;
lm_sum[0][bf_loc] += sum_t[0];
lm_sum[1][bf_loc] += sum_t[1];
lm_sqsum[0][bf_loc] += sqsum_t[0];
lm_sqsum[1][bf_loc] += sqsum_t[1];
sum_p = (__local int*)(&(lm_sum[0][bf_loc]));
sqsum_p = (__local float*)(&(lm_sqsum[0][bf_loc]));
for(int k = 0; k < 4; k++)
{
if(gid * 4 + k >= cols + pre_invalid || gid * 4 + k < pre_invalid) continue;
sum[loc_s0 + k * dst_step / 4] = sum_p[k];
sqsum[loc_s0 + k * dst_step / 4] = sqsum_p[k];
}
sum_p = (__local int*)(&(lm_sum[1][bf_loc]));
sqsum_p = (__local float*)(&(lm_sqsum[1][bf_loc]));
for(int k = 0; k < 4; k++)
{
if(gid * 4 + k + 4 >= cols + pre_invalid) break;
sum[loc_s1 + k * dst_step / 4] = sum_p[k];
sqsum[loc_s1 + k * dst_step / 4] = sqsum_p[k];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
}
}
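// integral_cols is a work-efficient two-phase (Blelloch) scan over column
// pairs: the first inner loop is the up-sweep (reduction), the second is the
// down-sweep that converts the reduction tree into prefix sums, and
// GET_CONFLICT_OFFSET pads local-memory indices so strided accesses avoid
// bank conflicts; integral_rows below applies the same scan along the rows.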
kernel void integral_rows(__global int4 *srcsum,__global float4 * srcsqsum,__global int *sum ,
__global float *sqsum,int rows,int cols,int src_step,int sum_step,
int sqsum_step,int sum_offset,int sqsum_offset)
{
unsigned int lid = get_local_id(0);
unsigned int gid = get_group_id(0);
int4 src_t[2], sum_t[2];
float4 sqsrc_t[2],sqsum_t[2];
__local int4 lm_sum[2][LSIZE + LOG_LSIZE];
__local float4 lm_sqsum[2][LSIZE + LOG_LSIZE];
__local int *sum_p;
__local float *sqsum_p;
src_step = src_step >> 4;
for(int i = 0; i < rows; i =i + LSIZE_1)
{
src_t[0] = i + lid < rows ? srcsum[(lid+i) * src_step + gid * 2] : 0;
sqsrc_t[0] = i + lid < rows ? srcsqsum[(lid+i) * src_step + gid * 2] : 0;
src_t[1] = i + lid < rows ? srcsum[(lid+i) * src_step + gid * 2 + 1] : 0;
sqsrc_t[1] = i + lid < rows ? srcsqsum[(lid+i) * src_step + gid * 2 + 1] : 0;
sum_t[0] = (i == 0 ? 0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]);
sqsum_t[0] = (i == 0 ? 0 : lm_sqsum[0][LSIZE_2 + LOG_LSIZE]);
sum_t[1] = (i == 0 ? 0 : lm_sum[1][LSIZE_2 + LOG_LSIZE]);
sqsum_t[1] = (i == 0 ? 0 : lm_sqsum[1][LSIZE_2 + LOG_LSIZE]);
barrier(CLK_LOCAL_MEM_FENCE);
int bf_loc = lid + GET_CONFLICT_OFFSET(lid);
lm_sum[0][bf_loc] = src_t[0];
lm_sqsum[0][bf_loc] = sqsrc_t[0];
lm_sum[1][bf_loc] = src_t[1];
lm_sqsum[1][bf_loc] = sqsrc_t[1];
int offset = 1;
for(int d = LSIZE >> 1 ; d > 0; d>>=1)
{
barrier(CLK_LOCAL_MEM_FENCE);
int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
ai += GET_CONFLICT_OFFSET(ai);
bi += GET_CONFLICT_OFFSET(bi);
if((lid & 127) < d)
{
lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai];
lm_sqsum[lid >> 7][bi] += lm_sqsum[lid >> 7][ai];
}
offset <<= 1;
}
if(lid < 2)
{
lm_sum[lid][LSIZE_2 + LOG_LSIZE] = 0;
lm_sqsum[lid][LSIZE_2 + LOG_LSIZE] = 0;
}
for(int d = 1; d < LSIZE; d <<= 1)
{
barrier(CLK_LOCAL_MEM_FENCE);
offset >>= 1;
int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
ai += GET_CONFLICT_OFFSET(ai);
bi += GET_CONFLICT_OFFSET(bi);
if((lid & 127) < d)
{
lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai];
lm_sum[lid >> 7][ai] = lm_sum[lid >> 7][bi] - lm_sum[lid >> 7][ai];
lm_sqsum[lid >> 7][bi] += lm_sqsum[lid >> 7][ai];
lm_sqsum[lid >> 7][ai] = lm_sqsum[lid >> 7][bi] - lm_sqsum[lid >> 7][ai];
}
}
if(gid == 0 && (i + lid) <= rows)
{
sum[sum_offset + i + lid] = 0;
sqsum[sqsum_offset + i + lid] = 0;
}
if(i + lid == 0)
{
int loc0 = gid * 2 * sum_step;
int loc1 = gid * 2 * sqsum_step;
for(int k = 1;k <= 8;k++)
{
if(gid * 8 + k > cols) break;
sum[sum_offset + loc0 + k * sum_step / 4] = 0;
sqsum[sqsum_offset + loc1 + k * sqsum_step / 4] = 0;
}
}
if(lid > 0 && (i+lid) <= rows){
int loc_s0 = sum_offset + gid * 2 * sum_step + sum_step / 4 + i + lid, loc_s1 = loc_s0 + sum_step ;
int loc_sq0 = sqsum_offset + gid * 2 * sqsum_step + sqsum_step / 4 + i + lid, loc_sq1 = loc_sq0 + sqsum_step ;
lm_sum[0][bf_loc] += sum_t[0];
lm_sum[1][bf_loc] += sum_t[1];
lm_sqsum[0][bf_loc] += sqsum_t[0];
lm_sqsum[1][bf_loc] += sqsum_t[1];
sum_p = (__local int*)(&(lm_sum[0][bf_loc]));
sqsum_p = (__local float*)(&(lm_sqsum[0][bf_loc]));
for(int k = 0; k < 4; k++)
{
if(gid * 8 + k >= cols) break;
sum[loc_s0 + k * sum_step / 4] = sum_p[k];
sqsum[loc_sq0 + k * sqsum_step / 4] = sqsum_p[k];
}
sum_p = (__local int*)(&(lm_sum[1][bf_loc]));
sqsum_p = (__local float*)(&(lm_sqsum[1][bf_loc]));
for(int k = 0; k < 4; k++)
{
if(gid * 8 + 4 + k >= cols) break;
sum[loc_s1 + k * sum_step / 4] = sum_p[k];
sqsum[loc_sq1 + k * sqsum_step / 4] = sqsum_p[k];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
}
}

View File

@ -0,0 +1,227 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Shengen Yan,yanshengen@gmail.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other GpuMaterials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if defined (DOUBLE_SUPPORT)
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#endif
#define LSIZE 256
#define LSIZE_1 255
#define LSIZE_2 254
#define HF_LSIZE 128
#define LOG_LSIZE 8
#define LOG_NUM_BANKS 5
#define NUM_BANKS 32
#define GET_CONFLICT_OFFSET(lid) ((lid) >> LOG_NUM_BANKS)
kernel void integral_cols(__global uchar4 *src,__global int *sum ,
int src_offset,int pre_invalid,int rows,int cols,int src_step,int dst_step)
{
unsigned int lid = get_local_id(0);
unsigned int gid = get_group_id(0);
int4 src_t[2], sum_t[2];
__local int4 lm_sum[2][LSIZE + LOG_LSIZE];
__local int* sum_p;
src_step = src_step >> 2;
gid = gid << 1;
for(int i = 0; i < rows; i =i + LSIZE_1)
{
src_t[0] = (i + lid < rows ? convert_int4(src[src_offset + (lid+i) * src_step + gid]) : 0);
src_t[1] = (i + lid < rows ? convert_int4(src[src_offset + (lid+i) * src_step + gid + 1]) : 0);
sum_t[0] = (i == 0 ? 0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]);
sum_t[1] = (i == 0 ? 0 : lm_sum[1][LSIZE_2 + LOG_LSIZE]);
barrier(CLK_LOCAL_MEM_FENCE);
int bf_loc = lid + GET_CONFLICT_OFFSET(lid);
lm_sum[0][bf_loc] = src_t[0];
lm_sum[1][bf_loc] = src_t[1];
int offset = 1;
for(int d = LSIZE >> 1 ; d > 0; d>>=1)
{
barrier(CLK_LOCAL_MEM_FENCE);
int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
ai += GET_CONFLICT_OFFSET(ai);
bi += GET_CONFLICT_OFFSET(bi);
if((lid & 127) < d)
{
lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai];
}
offset <<= 1;
}
if(lid < 2)
{
lm_sum[lid][LSIZE_2 + LOG_LSIZE] = 0;
}
for(int d = 1; d < LSIZE; d <<= 1)
{
barrier(CLK_LOCAL_MEM_FENCE);
offset >>= 1;
int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
ai += GET_CONFLICT_OFFSET(ai);
bi += GET_CONFLICT_OFFSET(bi);
if((lid & 127) < d)
{
lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai];
lm_sum[lid >> 7][ai] = lm_sum[lid >> 7][bi] - lm_sum[lid >> 7][ai];
}
}
if(lid > 0 && (i+lid) <= rows){
int loc_s0 = gid * dst_step + i + lid - 1 - pre_invalid * dst_step / 4, loc_s1 = loc_s0 + dst_step ;
lm_sum[0][bf_loc] += sum_t[0];
lm_sum[1][bf_loc] += sum_t[1];
sum_p = (__local int*)(&(lm_sum[0][bf_loc]));
for(int k = 0; k < 4; k++)
{
if(gid * 4 + k >= cols + pre_invalid || gid * 4 + k < pre_invalid) continue;
sum[loc_s0 + k * dst_step / 4] = sum_p[k];
}
sum_p = (__local int*)(&(lm_sum[1][bf_loc]));
for(int k = 0; k < 4; k++)
{
if(gid * 4 + k + 4 >= cols + pre_invalid) break;
sum[loc_s1 + k * dst_step / 4] = sum_p[k];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
}
}
kernel void integral_rows(__global int4 *srcsum,__global int *sum ,
int rows,int cols,int src_step,int sum_step,
int sum_offset)
{
unsigned int lid = get_local_id(0);
unsigned int gid = get_group_id(0);
int4 src_t[2], sum_t[2];
__local int4 lm_sum[2][LSIZE + LOG_LSIZE];
__local int *sum_p;
src_step = src_step >> 4;
for(int i = 0; i < rows; i =i + LSIZE_1)
{
src_t[0] = i + lid < rows ? srcsum[(lid+i) * src_step + gid * 2] : 0;
src_t[1] = i + lid < rows ? srcsum[(lid+i) * src_step + gid * 2 + 1] : 0;
sum_t[0] = (i == 0 ? 0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]);
sum_t[1] = (i == 0 ? 0 : lm_sum[1][LSIZE_2 + LOG_LSIZE]);
barrier(CLK_LOCAL_MEM_FENCE);
int bf_loc = lid + GET_CONFLICT_OFFSET(lid);
lm_sum[0][bf_loc] = src_t[0];
lm_sum[1][bf_loc] = src_t[1];
int offset = 1;
for(int d = LSIZE >> 1 ; d > 0; d>>=1)
{
barrier(CLK_LOCAL_MEM_FENCE);
int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
ai += GET_CONFLICT_OFFSET(ai);
bi += GET_CONFLICT_OFFSET(bi);
if((lid & 127) < d)
{
lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai];
}
offset <<= 1;
}
if(lid < 2)
{
lm_sum[lid][LSIZE_2 + LOG_LSIZE] = 0;
}
for(int d = 1; d < LSIZE; d <<= 1)
{
barrier(CLK_LOCAL_MEM_FENCE);
offset >>= 1;
int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
ai += GET_CONFLICT_OFFSET(ai);
bi += GET_CONFLICT_OFFSET(bi);
if((lid & 127) < d)
{
lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai];
lm_sum[lid >> 7][ai] = lm_sum[lid >> 7][bi] - lm_sum[lid >> 7][ai];
}
}
if(gid == 0 && (i + lid) <= rows)
{
sum[sum_offset + i + lid] = 0;
}
if(i + lid == 0)
{
int loc0 = gid * 2 * sum_step;
for(int k = 1;k <= 8;k++)
{
if(gid * 8 + k > cols) break;
sum[sum_offset + loc0 + k * sum_step / 4] = 0;
}
}
if(lid > 0 && (i+lid) <= rows){
int loc_s0 = sum_offset + gid * 2 * sum_step + sum_step / 4 + i + lid, loc_s1 = loc_s0 + sum_step ;
lm_sum[0][bf_loc] += sum_t[0];
lm_sum[1][bf_loc] += sum_t[1];
sum_p = (__local int*)(&(lm_sum[0][bf_loc]));
for(int k = 0; k < 4; k++)
{
if(gid * 8 + k >= cols) break;
sum[loc_s0 + k * sum_step / 4] = sum_p[k];
}
sum_p = (__local int*)(&(lm_sum[1][bf_loc]));
for(int k = 0; k < 4; k++)
{
if(gid * 8 + 4 + k >= cols) break;
sum[loc_s1 + k * sum_step / 4] = sum_p[k];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
}
}

View File

@ -0,0 +1,487 @@
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Niko Li, newlife20080214@gmail.com
// Zero Lin, zero.lin@amd.com
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other GpuMaterials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//
/*
__kernel void medianFilter_C1(__global uchar * src, __global uchar * dst, int srcOffset, int dstOffset, int cols,
int rows, int srcStep, int dstStep, int m)
{
int dx = get_global_id(0)-(m>>1);
int dy = get_global_id(1)-(m>>1);
short histom[256];
for(int i=0;i<256;++i)
histom[i]=0;
for(int i=0;i<m;++i)
{
__global uchar * data = src + srcOffset + mul24(srcStep,clamp(dy + (i), 0, rows-1));
for(int j=dx;j<dx+m;++j)
{
histom[data[clamp(j, 0, cols-1)]]++;
}
}
int now=0;
int goal=(m*m+1)>>1;
int v;
for(int i=0;i<256;++i)
{
v=(now<goal?i:v);
now+=histom[i];
}
if(dy<rows && dx<cols)
dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)]=v;
}
*/
#define op(a,b) {mid=a; a=min(a,b); b=max(mid,b);}
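// op(a,b) is a compare-exchange: afterwards a holds min(a,b) and b holds
// max(a,b); it is the building block of the sorting networks below.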
__kernel void medianFilter3_C4_D0(__global uchar4 * src, __global uchar4 * dst, int srcOffset, int dstOffset, int cols,
int rows, int srcStep, int dstStep)
{
__local uchar4 data[18][18];
__global uchar4* source=src + srcOffset;
int dx = get_global_id(0) - get_local_id(0) -1;
int dy = get_global_id(1) - get_local_id(1) -1;
const int id = min((int)(get_local_id(0)*16+get_local_id(1)), 9*18-1);
int dr=id/18;
int dc=id%18;
int r=clamp(dy+dr, 0, rows-1);
int c=clamp(dx+dc, 0, cols-1);
data[dr][dc] = source[r*srcStep + c];
r=clamp(dy+dr+9, 0, rows-1);
data[dr+9][dc] = source[r*srcStep + c];
barrier(CLK_LOCAL_MEM_FENCE);
int x =get_local_id(0);
int y =get_local_id(1);
uchar4 p0=data[y][x], p1=data[y][x+1], p2=data[y][x+2];
uchar4 p3=data[y+1][x], p4=data[y+1][x+1], p5=data[y+1][x+2];
uchar4 p6=data[y+2][x], p7=data[y+2][x+1], p8=data[y+2][x+2];
uchar4 mid;
op(p1, p2); op(p4, p5); op(p7, p8); op(p0, p1);
op(p3, p4); op(p6, p7); op(p1, p2); op(p4, p5);
op(p7, p8); op(p0, p3); op(p5, p8); op(p4, p7);
op(p3, p6); op(p1, p4); op(p2, p5); op(p4, p7);
op(p4, p2); op(p6, p4); op(p4, p2);
if(get_global_id(1)<rows && get_global_id(0)<cols)
dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)]=p4;
}
#undef op
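// The 3x3 kernels use a fixed median-selection network: the 19 exchanges
// above do not fully sort p0..p8, but they guarantee that the median of the
// nine samples ends up in p4.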
#define op(a,b) {mid=a; a=min(a,b); b=max(mid,b);}
__kernel void medianFilter3_C1_D0(__global uchar * src, __global uchar * dst, int srcOffset, int dstOffset, int cols,
int rows, int srcStep, int dstStep)
{
__local uchar data[18][18];
__global uchar* source=src + srcOffset;
int dx = get_global_id(0) - get_local_id(0) -1;
int dy = get_global_id(1) - get_local_id(1) -1;
const int id = min((int)(get_local_id(0)*16+get_local_id(1)), 9*18-1);
int dr=id/18;
int dc=id%18;
int r=clamp(dy+dr, 0, rows-1);
int c=clamp(dx+dc, 0, cols-1);
data[dr][dc] = source[r*srcStep + c];
r=clamp(dy+dr+9, 0, rows-1);
data[dr+9][dc] = source[r*srcStep + c];
barrier(CLK_LOCAL_MEM_FENCE);
int x =get_local_id(0);
int y =get_local_id(1);
uchar p0=data[y][x], p1=data[y][x+1], p2=data[y][x+2];
uchar p3=data[y+1][x], p4=data[y+1][x+1], p5=data[y+1][x+2];
uchar p6=data[y+2][x], p7=data[y+2][x+1], p8=data[y+2][x+2];
uchar mid;
op(p1, p2); op(p4, p5); op(p7, p8); op(p0, p1);
op(p3, p4); op(p6, p7); op(p1, p2); op(p4, p5);
op(p7, p8); op(p0, p3); op(p5, p8); op(p4, p7);
op(p3, p6); op(p1, p4); op(p2, p5); op(p4, p7);
op(p4, p2); op(p6, p4); op(p4, p2);
if(get_global_id(1)<rows && get_global_id(0)<cols)
dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)]=p4;
}
#undef op
#define op(a,b) {mid=a; a=min(a,b); b=max(mid,b);}
__kernel void medianFilter3_C1_D5(__global float * src, __global float * dst, int srcOffset, int dstOffset, int cols,
int rows, int srcStep, int dstStep)
{
__local float data[18][18];
__global float* source=src + srcOffset;
int dx = get_global_id(0) - get_local_id(0) -1;
int dy = get_global_id(1) - get_local_id(1) -1;
const int id = min((int)(get_local_id(0)*16+get_local_id(1)), 9*18-1);
int dr=id/18;
int dc=id%18;
int r=clamp(dy+dr, 0, rows-1);
int c=clamp(dx+dc, 0, cols-1);
data[dr][dc] = source[r*srcStep + c];
r=clamp(dy+dr+9, 0, rows-1);
data[dr+9][dc] = source[r*srcStep + c];
barrier(CLK_LOCAL_MEM_FENCE);
int x =get_local_id(0);
int y =get_local_id(1);
float p0=data[y][x], p1=data[y][x+1], p2=data[y][x+2];
float p3=data[y+1][x], p4=data[y+1][x+1], p5=data[y+1][x+2];
float p6=data[y+2][x], p7=data[y+2][x+1], p8=data[y+2][x+2];
float mid;
op(p1, p2); op(p4, p5); op(p7, p8); op(p0, p1);
op(p3, p4); op(p6, p7); op(p1, p2); op(p4, p5);
op(p7, p8); op(p0, p3); op(p5, p8); op(p4, p7);
op(p3, p6); op(p1, p4); op(p2, p5); op(p4, p7);
op(p4, p2); op(p6, p4); op(p4, p2);
if(get_global_id(1)<rows && get_global_id(0)<cols)
dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)]=p4;
}
#undef op
#define op(a,b) {mid=a; a=min(a,b); b=max(mid,b);}
__kernel void medianFilter3_C4_D5(__global float4 * src, __global float4 * dst, int srcOffset, int dstOffset, int cols,
int rows, int srcStep, int dstStep)
{
__local float4 data[18][18];
__global float4* source=src + srcOffset;
int dx = get_global_id(0) - get_local_id(0) -1;
int dy = get_global_id(1) - get_local_id(1) -1;
const int id = min((int)(get_local_id(0)*16+get_local_id(1)), 9*18-1);
int dr=id/18;
int dc=id%18;
int r=clamp(dy+dr, 0, rows-1);
int c=clamp(dx+dc, 0, cols-1);
data[dr][dc] = source[r*srcStep + c];
r=clamp(dy+dr+9, 0, rows-1);
data[dr+9][dc] = source[r*srcStep + c];
barrier(CLK_LOCAL_MEM_FENCE);
int x =get_local_id(0);
int y =get_local_id(1);
float4 p0=data[y][x], p1=data[y][x+1], p2=data[y][x+2];
float4 p3=data[y+1][x], p4=data[y+1][x+1], p5=data[y+1][x+2];
float4 p6=data[y+2][x], p7=data[y+2][x+1], p8=data[y+2][x+2];
float4 mid;
op(p1, p2); op(p4, p5); op(p7, p8); op(p0, p1);
op(p3, p4); op(p6, p7); op(p1, p2); op(p4, p5);
op(p7, p8); op(p0, p3); op(p5, p8); op(p4, p7);
op(p3, p6); op(p1, p4); op(p2, p5); op(p4, p7);
op(p4, p2); op(p6, p4); op(p4, p2);
if(get_global_id(1)<rows && get_global_id(0)<cols)
dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)]=p4;
}
#undef op
#define op(a,b) {mid=a; a=min(a,b); b=max(mid,b);}
__kernel void medianFilter5_C4_D0(__global uchar4 * src, __global uchar4 * dst, int srcOffset, int dstOffset, int cols,
int rows, int srcStep, int dstStep)
{
__local uchar4 data[20][20];
__global uchar4* source=src + srcOffset;
int dx = get_global_id(0) - get_local_id(0) -2;
int dy = get_global_id(1) - get_local_id(1) -2;
const int id = min((int)(get_local_id(0)*16+get_local_id(1)), 10*20-1);
int dr=id/20;
int dc=id%20;
int r=clamp(dy+dr, 0, rows-1);
int c=clamp(dx+dc, 0, cols-1);
data[dr][dc] = source[r*srcStep + c];
r=clamp(dy+dr+10, 0, rows-1);
data[dr+10][dc] = source[r*srcStep + c];
barrier(CLK_LOCAL_MEM_FENCE);
int x =get_local_id(0);
int y =get_local_id(1);
uchar4 p0=data[y][x], p1=data[y][x+1], p2=data[y][x+2], p3=data[y][x+3], p4=data[y][x+4];
uchar4 p5=data[y+1][x], p6=data[y+1][x+1], p7=data[y+1][x+2], p8=data[y+1][x+3], p9=data[y+1][x+4];
uchar4 p10=data[y+2][x], p11=data[y+2][x+1], p12=data[y+2][x+2], p13=data[y+2][x+3], p14=data[y+2][x+4];
uchar4 p15=data[y+3][x], p16=data[y+3][x+1], p17=data[y+3][x+2], p18=data[y+3][x+3], p19=data[y+3][x+4];
uchar4 p20=data[y+4][x], p21=data[y+4][x+1], p22=data[y+4][x+2], p23=data[y+4][x+3], p24=data[y+4][x+4];
uchar4 mid;
op(p1, p2); op(p0, p1); op(p1, p2); op(p4, p5); op(p3, p4);
op(p4, p5); op(p0, p3); op(p2, p5); op(p2, p3); op(p1, p4);
op(p1, p2); op(p3, p4); op(p7, p8); op(p6, p7); op(p7, p8);
op(p10, p11); op(p9, p10); op(p10, p11); op(p6, p9); op(p8, p11);
op(p8, p9); op(p7, p10); op(p7, p8); op(p9, p10); op(p0, p6);
op(p4, p10); op(p4, p6); op(p2, p8); op(p2, p4); op(p6, p8);
op(p1, p7); op(p5, p11); op(p5, p7); op(p3, p9); op(p3, p5);
op(p7, p9); op(p1, p2); op(p3, p4); op(p5, p6); op(p7, p8);
op(p9, p10); op(p13, p14); op(p12, p13); op(p13, p14); op(p16, p17);
op(p15, p16); op(p16, p17); op(p12, p15); op(p14, p17); op(p14, p15);
op(p13, p16); op(p13, p14); op(p15, p16); op(p19, p20); op(p18, p19);
op(p19, p20); op(p21, p22); op(p23, p24); op(p21, p23); op(p22, p24);
op(p22, p23); op(p18, p21); op(p20, p23); op(p20, p21); op(p19, p22);
op(p22, p24); op(p19, p20); op(p21, p22); op(p23, p24); op(p12, p18);
op(p16, p22); op(p16, p18); op(p14, p20); op(p20, p24); op(p14, p16);
op(p18, p20); op(p22, p24); op(p13, p19); op(p17, p23); op(p17, p19);
op(p15, p21); op(p15, p17); op(p19, p21); op(p13, p14); op(p15, p16);
op(p17, p18); op(p19, p20); op(p21, p22); op(p23, p24); op(p0, p12);
op(p8, p20); op(p8, p12); op(p4, p16); op(p16, p24); op(p12, p16);
op(p2, p14); op(p10, p22); op(p10, p14); op(p6, p18); op(p6, p10);
op(p10, p12); op(p1, p13); op(p9, p21); op(p9, p13); op(p5, p17);
op(p13, p17); op(p3, p15); op(p11, p23); op(p11, p15); op(p7, p19);
op(p7, p11); op(p11, p13); op(p11, p12);
if(get_global_id(1)<rows && get_global_id(0)<cols)
dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)]=p12;
}
#undef op
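// The 5x5 variants use the analogous median-of-25 selection network over
// p0..p24, leaving the median of the window in p12 without a full sort.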
#define op(a,b) {mid=a; a=min(a,b); b=max(mid,b);}
__kernel void medianFilter5_C1_D0(__global uchar * src, __global uchar * dst, int srcOffset, int dstOffset, int cols,
int rows, int srcStep, int dstStep)
{
__local uchar data[20][20];
__global uchar* source=src + srcOffset;
int dx = get_global_id(0) - get_local_id(0) -2;
int dy = get_global_id(1) - get_local_id(1) -2;
const int id = min((int)(get_local_id(0)*16+get_local_id(1)), 10*20-1);
int dr=id/20;
int dc=id%20;
int r=clamp(dy+dr, 0, rows-1);
int c=clamp(dx+dc, 0, cols-1);
data[dr][dc] = source[r*srcStep + c];
r=clamp(dy+dr+10, 0, rows-1);
data[dr+10][dc] = source[r*srcStep + c];
barrier(CLK_LOCAL_MEM_FENCE);
int x =get_local_id(0);
int y =get_local_id(1);
uchar p0=data[y][x], p1=data[y][x+1], p2=data[y][x+2], p3=data[y][x+3], p4=data[y][x+4];
uchar p5=data[y+1][x], p6=data[y+1][x+1], p7=data[y+1][x+2], p8=data[y+1][x+3], p9=data[y+1][x+4];
uchar p10=data[y+2][x], p11=data[y+2][x+1], p12=data[y+2][x+2], p13=data[y+2][x+3], p14=data[y+2][x+4];
uchar p15=data[y+3][x], p16=data[y+3][x+1], p17=data[y+3][x+2], p18=data[y+3][x+3], p19=data[y+3][x+4];
uchar p20=data[y+4][x], p21=data[y+4][x+1], p22=data[y+4][x+2], p23=data[y+4][x+3], p24=data[y+4][x+4];
uchar mid;
op(p1, p2); op(p0, p1); op(p1, p2); op(p4, p5); op(p3, p4);
op(p4, p5); op(p0, p3); op(p2, p5); op(p2, p3); op(p1, p4);
op(p1, p2); op(p3, p4); op(p7, p8); op(p6, p7); op(p7, p8);
op(p10, p11); op(p9, p10); op(p10, p11); op(p6, p9); op(p8, p11);
op(p8, p9); op(p7, p10); op(p7, p8); op(p9, p10); op(p0, p6);
op(p4, p10); op(p4, p6); op(p2, p8); op(p2, p4); op(p6, p8);
op(p1, p7); op(p5, p11); op(p5, p7); op(p3, p9); op(p3, p5);
op(p7, p9); op(p1, p2); op(p3, p4); op(p5, p6); op(p7, p8);
op(p9, p10); op(p13, p14); op(p12, p13); op(p13, p14); op(p16, p17);
op(p15, p16); op(p16, p17); op(p12, p15); op(p14, p17); op(p14, p15);
op(p13, p16); op(p13, p14); op(p15, p16); op(p19, p20); op(p18, p19);
op(p19, p20); op(p21, p22); op(p23, p24); op(p21, p23); op(p22, p24);
op(p22, p23); op(p18, p21); op(p20, p23); op(p20, p21); op(p19, p22);
op(p22, p24); op(p19, p20); op(p21, p22); op(p23, p24); op(p12, p18);
op(p16, p22); op(p16, p18); op(p14, p20); op(p20, p24); op(p14, p16);
op(p18, p20); op(p22, p24); op(p13, p19); op(p17, p23); op(p17, p19);
op(p15, p21); op(p15, p17); op(p19, p21); op(p13, p14); op(p15, p16);
op(p17, p18); op(p19, p20); op(p21, p22); op(p23, p24); op(p0, p12);
op(p8, p20); op(p8, p12); op(p4, p16); op(p16, p24); op(p12, p16);
op(p2, p14); op(p10, p22); op(p10, p14); op(p6, p18); op(p6, p10);
op(p10, p12); op(p1, p13); op(p9, p21); op(p9, p13); op(p5, p17);
op(p13, p17); op(p3, p15); op(p11, p23); op(p11, p15); op(p7, p19);
op(p7, p11); op(p11, p13); op(p11, p12);
if(get_global_id(1)<rows && get_global_id(0)<cols)
dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)]=p12;
}
#undef op
#define op(a,b) {mid=a; a=min(a,b); b=max(mid,b);}
__kernel void medianFilter5_C4_D5(__global float4 * src, __global float4 * dst, int srcOffset, int dstOffset, int cols,
int rows, int srcStep, int dstStep)
{
__local float4 data[20][20];
__global float4* source=src + srcOffset;
int dx = get_global_id(0) - get_local_id(0) -2;
int dy = get_global_id(1) - get_local_id(1) -2;
const int id = min((int)(get_local_id(0)*16+get_local_id(1)), 10*20-1);
int dr=id/20;
int dc=id%20;
int r=clamp(dy+dr, 0, rows-1);
int c=clamp(dx+dc, 0, cols-1);
data[dr][dc] = source[r*srcStep + c];
r=clamp(dy+dr+10, 0, rows-1);
data[dr+10][dc] = source[r*srcStep + c];
barrier(CLK_LOCAL_MEM_FENCE);
int x =get_local_id(0);
int y =get_local_id(1);
float4 p0=data[y][x], p1=data[y][x+1], p2=data[y][x+2], p3=data[y][x+3], p4=data[y][x+4];
float4 p5=data[y+1][x], p6=data[y+1][x+1], p7=data[y+1][x+2], p8=data[y+1][x+3], p9=data[y+1][x+4];
float4 p10=data[y+2][x], p11=data[y+2][x+1], p12=data[y+2][x+2], p13=data[y+2][x+3], p14=data[y+2][x+4];
float4 p15=data[y+3][x], p16=data[y+3][x+1], p17=data[y+3][x+2], p18=data[y+3][x+3], p19=data[y+3][x+4];
float4 p20=data[y+4][x], p21=data[y+4][x+1], p22=data[y+4][x+2], p23=data[y+4][x+3], p24=data[y+4][x+4];
float4 mid;
op(p1, p2); op(p0, p1); op(p1, p2); op(p4, p5); op(p3, p4);
op(p4, p5); op(p0, p3); op(p2, p5); op(p2, p3); op(p1, p4);
op(p1, p2); op(p3, p4); op(p7, p8); op(p6, p7); op(p7, p8);
op(p10, p11); op(p9, p10); op(p10, p11); op(p6, p9); op(p8, p11);
op(p8, p9); op(p7, p10); op(p7, p8); op(p9, p10); op(p0, p6);
op(p4, p10); op(p4, p6); op(p2, p8); op(p2, p4); op(p6, p8);
op(p1, p7); op(p5, p11); op(p5, p7); op(p3, p9); op(p3, p5);
op(p7, p9); op(p1, p2); op(p3, p4); op(p5, p6); op(p7, p8);
op(p9, p10); op(p13, p14); op(p12, p13); op(p13, p14); op(p16, p17);
op(p15, p16); op(p16, p17); op(p12, p15); op(p14, p17); op(p14, p15);
op(p13, p16); op(p13, p14); op(p15, p16); op(p19, p20); op(p18, p19);
op(p19, p20); op(p21, p22); op(p23, p24); op(p21, p23); op(p22, p24);
op(p22, p23); op(p18, p21); op(p20, p23); op(p20, p21); op(p19, p22);
op(p22, p24); op(p19, p20); op(p21, p22); op(p23, p24); op(p12, p18);
op(p16, p22); op(p16, p18); op(p14, p20); op(p20, p24); op(p14, p16);
op(p18, p20); op(p22, p24); op(p13, p19); op(p17, p23); op(p17, p19);
op(p15, p21); op(p15, p17); op(p19, p21); op(p13, p14); op(p15, p16);
op(p17, p18); op(p19, p20); op(p21, p22); op(p23, p24); op(p0, p12);
op(p8, p20); op(p8, p12); op(p4, p16); op(p16, p24); op(p12, p16);
op(p2, p14); op(p10, p22); op(p10, p14); op(p6, p18); op(p6, p10);
op(p10, p12); op(p1, p13); op(p9, p21); op(p9, p13); op(p5, p17);
op(p13, p17); op(p3, p15); op(p11, p23); op(p11, p15); op(p7, p19);
op(p7, p11); op(p11, p13); op(p11, p12);
if(get_global_id(1)<rows && get_global_id(0)<cols)
dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)]=p12;
}
#undef op
#define op(a,b) {mid=a; a=min(a,b); b=max(mid,b);}
__kernel void medianFilter5_C1_D5(__global float * src, __global float * dst, int srcOffset, int dstOffset, int cols,
int rows, int srcStep, int dstStep)
{
__local float data[20][20];
__global float* source=src + srcOffset;
int dx = get_global_id(0) - get_local_id(0) -2;
int dy = get_global_id(1) - get_local_id(1) -2;
const int id = min((int)(get_local_id(0)*16+get_local_id(1)), 10*20-1);
int dr=id/20;
int dc=id%20;
int r=clamp(dy+dr, 0, rows-1);
int c=clamp(dx+dc, 0, cols-1);
data[dr][dc] = source[r*srcStep + c];
r=clamp(dy+dr+10, 0, rows-1);
data[dr+10][dc] = source[r*srcStep + c];
barrier(CLK_LOCAL_MEM_FENCE);
int x =get_local_id(0);
int y =get_local_id(1);
float p0=data[y][x], p1=data[y][x+1], p2=data[y][x+2], p3=data[y][x+3], p4=data[y][x+4];
float p5=data[y+1][x], p6=data[y+1][x+1], p7=data[y+1][x+2], p8=data[y+1][x+3], p9=data[y+1][x+4];
float p10=data[y+2][x], p11=data[y+2][x+1], p12=data[y+2][x+2], p13=data[y+2][x+3], p14=data[y+2][x+4];
float p15=data[y+3][x], p16=data[y+3][x+1], p17=data[y+3][x+2], p18=data[y+3][x+3], p19=data[y+3][x+4];
float p20=data[y+4][x], p21=data[y+4][x+1], p22=data[y+4][x+2], p23=data[y+4][x+3], p24=data[y+4][x+4];
float mid;
op(p1, p2); op(p0, p1); op(p1, p2); op(p4, p5); op(p3, p4);
op(p4, p5); op(p0, p3); op(p2, p5); op(p2, p3); op(p1, p4);
op(p1, p2); op(p3, p4); op(p7, p8); op(p6, p7); op(p7, p8);
op(p10, p11); op(p9, p10); op(p10, p11); op(p6, p9); op(p8, p11);
op(p8, p9); op(p7, p10); op(p7, p8); op(p9, p10); op(p0, p6);
op(p4, p10); op(p4, p6); op(p2, p8); op(p2, p4); op(p6, p8);
op(p1, p7); op(p5, p11); op(p5, p7); op(p3, p9); op(p3, p5);
op(p7, p9); op(p1, p2); op(p3, p4); op(p5, p6); op(p7, p8);
op(p9, p10); op(p13, p14); op(p12, p13); op(p13, p14); op(p16, p17);
op(p15, p16); op(p16, p17); op(p12, p15); op(p14, p17); op(p14, p15);
op(p13, p16); op(p13, p14); op(p15, p16); op(p19, p20); op(p18, p19);
op(p19, p20); op(p21, p22); op(p23, p24); op(p21, p23); op(p22, p24);
op(p22, p23); op(p18, p21); op(p20, p23); op(p20, p21); op(p19, p22);
op(p22, p24); op(p19, p20); op(p21, p22); op(p23, p24); op(p12, p18);
op(p16, p22); op(p16, p18); op(p14, p20); op(p20, p24); op(p14, p16);
op(p18, p20); op(p22, p24); op(p13, p19); op(p17, p23); op(p17, p19);
op(p15, p21); op(p15, p17); op(p19, p21); op(p13, p14); op(p15, p16);
op(p17, p18); op(p19, p20); op(p21, p22); op(p23, p24); op(p0, p12);
op(p8, p20); op(p8, p12); op(p4, p16); op(p16, p24); op(p12, p16);
op(p2, p14); op(p10, p22); op(p10, p14); op(p6, p18); op(p6, p10);
op(p10, p12); op(p1, p13); op(p9, p21); op(p9, p13); op(p5, p17);
op(p13, p17); op(p3, p15); op(p11, p23); op(p11, p15); op(p7, p19);
op(p7, p11); op(p11, p13); op(p11, p12);
if(get_global_id(1)<rows && get_global_id(0)<cols)
dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)]=p12;
}
#undef op
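// A minimal host-side launch sketch for the 5x5 median kernels above (the
// snippet is illustrative only; roundUp is an assumed helper that rounds its
// first argument up to a multiple of the second):
//
//   size_t local[2]  = { 16, 16 };                /* one 16x16 output tile per group */
//   size_t global[2] = { roundUp(cols, 16), roundUp(rows, 16) };
//   clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global, local, 0, NULL, NULL);
//
// The 16x16 local size is implied by the kernel bodies: they stage a 20x20
// local tile (16 plus a 2-pixel apron on each side) and compute
// id = local_id(0)*16 + local_id(1) when filling it.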

View File

@ -0,0 +1,555 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Wu Zailong, bullet@yeah.net
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other GpuMaterials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#pragma OPENCL EXTENSION cl_amd_printf : enable
#if defined DOUBLE_SUPPORT
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#endif
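// DOUBLE_SUPPORT is presumably supplied as a program build option (e.g.
// "-D DOUBLE_SUPPORT") by the host when the device reports cl_khr_fp64; note
// that the kernels below take a double4 nVal argument unconditionally, so they
// still require double support from the platform.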
__kernel void remapNNSConstant_C1_D0(__global unsigned char* dst, __global unsigned char const * restrict src,
__global short * map1, int dst_offset, int src_offset, int map1_offset, int dst_step, int src_step,
int map1_step, int src_cols, int src_rows, int dst_cols, int dst_rows, int map1_cols, int map1_rows , double4 nVal)
{
int x = get_global_id(0);
int y = get_global_id(1);
/*
if(x < dst_cols && y < dst_rows)
{
int dstIdx = y * dst_step + x + dst_offset;
int map1Idx = y * (map1_step>>2) + x + (map1_offset>>2) - (map1_offset & 1);
short2 map1_data = *(map1 + map1Idx);
int srcIdx = map1_data.y*src_step+map1_data.x + src_offset;
uchar src_data = *(src +srcIdx);
uchar dst_data = src_data;
*(dst +dstIdx)=(map1_data.x >= map1_cols || map1_data.y >= map1_rows) ? val : dst_data;
}
*/
int gx = (x << 2) - (dst_offset&3);
int4 Gx = (int4)(gx, gx+1, gx+2, gx+3);
uchar4 nval = convert_uchar4(nVal);
uchar val = nval.s0;
x = x << 2;
int dstStart = (y * dst_step + x + dst_offset) - (dst_offset&3);
int map1Start = y * map1_step + (x << 2) + map1_offset - ((dst_offset & 3) << 2);
short8 map1_data;
map1_data.s01 = *((__global short2 *)((__global char*)map1 + map1Start));
map1_data.s23 = *((__global short2 *)((__global char*)map1 + map1Start + 4));
map1_data.s45 = *((__global short2 *)((__global char*)map1 + map1Start + 8));
map1_data.s67 = *((__global short2 *)((__global char*)map1 + map1Start + 12));
int4 srcIdx ;
srcIdx.s0 = map1_data.s1 * src_step + map1_data.s0 + src_offset;
srcIdx.s1 = map1_data.s3 * src_step + map1_data.s2 + src_offset;
srcIdx.s2 = map1_data.s5 * src_step + map1_data.s4 + src_offset;
srcIdx.s3 = map1_data.s7 * src_step + map1_data.s6 + src_offset;
//uchar4 src_data = *(src + srcIdx);
uchar4 src_data;
src_data.s0 = *(src + srcIdx.s0);
src_data.s1 = *(src + srcIdx.s1);
src_data.s2 = *(src + srcIdx.s2);
src_data.s3 = *(src + srcIdx.s3);
uchar4 dst_data;
dst_data.s0 = (map1_data.s0 >= src_cols || map1_data.s1 >= src_rows)? val : src_data.s0;
dst_data.s1 = (map1_data.s2 >= src_cols || map1_data.s3 >= src_rows)? val : src_data.s1;
dst_data.s2 = (map1_data.s4 >= src_cols || map1_data.s5 >= src_rows)? val : src_data.s2;
dst_data.s3 = (map1_data.s6 >= src_cols || map1_data.s7 >= src_rows)? val : src_data.s3;
__global uchar4* d = (__global uchar4 *)(dst + dstStart);
uchar4 dVal = *d;
int4 con = (Gx >= 0 && Gx < dst_cols && y >= 0 && y < dst_rows);
dst_data = (convert_uchar4(con) != 0) ? dst_data : dVal;
*d = dst_data;
}
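// Layout note for the kernels in this file: each work-item produces a small
// vector of dst bytes starting at dstStart, which is rounded down to the
// vector alignment; Gx carries the logical column of every byte so the final
// masked select on con only replaces pixels inside the dst ROI, while bytes
// outside it are written back unchanged from dVal.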
__kernel void remapNNSConstant_C2_D0(__global unsigned char* dst, __global unsigned char const * restrict src,
__global short * map1, int dst_offset, int src_offset, int map1_offset, int dst_step, int src_step,
int map1_step, int src_cols, int src_rows, int dst_cols, int dst_rows, int map1_cols, int map1_rows , double4 nVal)
{
int x = get_global_id(0);
int y = get_global_id(1);
int gx = (x << 3) - (dst_offset&7);
int8 Gx = (int8)(gx, gx+1, gx+2, gx+3, gx+4, gx+5, gx+6, gx+7);
uchar4 nval = convert_uchar4(nVal);
uchar2 val = nval.s01;
x = x << 3;
int dstStart = (y * dst_step + x + dst_offset) - (dst_offset&7);
int map1Start = y * map1_step + (x << 1) + map1_offset - (((dst_offset>>1) & 3) << 2);
short8 map1_data;
map1_data.s01 = *((__global short2 *)((__global char*)map1 + map1Start));
map1_data.s23 = *((__global short2 *)((__global char*)map1 + map1Start + 4));
map1_data.s45 = *((__global short2 *)((__global char*)map1 + map1Start + 8));
map1_data.s67 = *((__global short2 *)((__global char*)map1 + map1Start + 12));
int4 srcIdx ;
srcIdx.s0 = map1_data.s1 * src_step + (map1_data.s0 << 1) + src_offset;
srcIdx.s1 = map1_data.s3 * src_step + (map1_data.s2 << 1) + src_offset;
srcIdx.s2 = map1_data.s5 * src_step + (map1_data.s4 << 1) + src_offset;
srcIdx.s3 = map1_data.s7 * src_step + (map1_data.s6 << 1) + src_offset;
//uchar4 src_data = *(src + srcIdx);
uchar8 src_data;
src_data.s01 = *((__global uchar2 *)((__global char*)src + srcIdx.s0));
src_data.s23 = *((__global uchar2 *)((__global char*)src + srcIdx.s1));
src_data.s45 = *((__global uchar2 *)((__global char*)src + srcIdx.s2));
src_data.s67 = *((__global uchar2 *)((__global char*)src + srcIdx.s3));
uchar8 dst_data;
dst_data.s01 = (map1_data.s0 >= src_cols || map1_data.s1 >= src_rows) ? val : (convert_uchar2(src_data.s01));
dst_data.s23 = (map1_data.s2 >= src_cols || map1_data.s3 >= src_rows) ? val : (convert_uchar2(src_data.s23));
dst_data.s45 = (map1_data.s4 >= src_cols || map1_data.s5 >= src_rows) ? val : (convert_uchar2(src_data.s45));
dst_data.s67 = (map1_data.s6 >= src_cols || map1_data.s7 >= src_rows) ? val : (convert_uchar2(src_data.s67));
__global uchar8* d = (__global uchar8 *)(dst + dstStart);
uchar8 dVal = *d;
int8 con = (Gx >= 0 && Gx < (dst_cols << 1) && y >= 0 && y < dst_rows);
dst_data = (convert_uchar8(con) != 0) ? dst_data : dVal;
*d = dst_data;
}
__kernel void remapNNSConstant_C4_D0(__global unsigned char* dst, __global unsigned char const * restrict src,
__global short * map1, int dst_offset, int src_offset, int map1_offset, int dst_step, int src_step,
int map1_step, int src_cols, int src_rows, int dst_cols, int dst_rows, int map1_cols, int map1_rows , double4 nVal)
{
int x = get_global_id(0);
int y = get_global_id(1);
int gx = (x << 4) - (dst_offset&15);
int16 Gx = (int16)(gx, gx+1, gx+2, gx+3, gx+4, gx+5, gx+6, gx+7, gx+8, gx+9, gx+10, gx+11, gx+12, gx+13, gx+14, gx+15);
uchar4 nval =convert_uchar4(nVal);
x = x << 4;
int dstStart = (y * dst_step + x + dst_offset) - (dst_offset&15);
int map1Start = y * map1_step + x + map1_offset - (((dst_offset>>2) & 3) << 2);
short8 map1_data;
map1_data.s01 = *((__global short2 *)((__global char*)map1 + map1Start));
map1_data.s23 = *((__global short2 *)((__global char*)map1 + map1Start + 4));
map1_data.s45 = *((__global short2 *)((__global char*)map1 + map1Start + 8));
map1_data.s67 = *((__global short2 *)((__global char*)map1 + map1Start + 12));
int4 srcIdx ;
srcIdx.s0 = map1_data.s1 * src_step + (map1_data.s0 << 2) + src_offset;
srcIdx.s1 = map1_data.s3 * src_step + (map1_data.s2 << 2) + src_offset;
srcIdx.s2 = map1_data.s5 * src_step + (map1_data.s4 << 2) + src_offset;
srcIdx.s3 = map1_data.s7 * src_step + (map1_data.s6 << 2) + src_offset;
// uchar16 src_data;
uchar4 src_a, src_b, src_c, src_d;
src_a = *((__global uchar4 *)((__global char*)src + srcIdx.s0));
src_b = *((__global uchar4 *)((__global char*)src + srcIdx.s1));
src_c = *((__global uchar4 *)((__global char*)src + srcIdx.s2));
src_d = *((__global uchar4 *)((__global char*)src + srcIdx.s3));
// src_data = (uchar16)(src_a, src_b, src_c, src_d);
uchar16 dst_data;
uchar4 dst_a, dst_b, dst_c, dst_d;
dst_a = (map1_data.s0 >= src_cols || map1_data.s1 >= src_rows) ? nval : src_a;
dst_b = (map1_data.s2 >= src_cols || map1_data.s3 >= src_rows) ? nval : src_b;
dst_c = (map1_data.s4 >= src_cols || map1_data.s5 >= src_rows) ? nval : src_c;
dst_d = (map1_data.s6 >= src_cols || map1_data.s7 >= src_rows) ? nval : src_d;
dst_data = (uchar16)(dst_a, dst_b, dst_c, dst_d);
__global uchar16* d = (__global uchar16 *)(dst + dstStart);
uchar16 dVal = *d;
int16 con = (Gx >= 0 && Gx < (dst_cols << 2) && y >= 0 && y < dst_rows);
dst_data = (convert_uchar16(con) != 0) ? dst_data : dVal;
*d = dst_data;
}
__kernel void remapNNFConstant_C1_D0(__global unsigned char* dst, __global unsigned char const * restrict src,
__global float * map1, int dst_offset, int src_offset, int map1_offset, int dst_step, int src_step,
int map1_step, int src_cols, int src_rows, int dst_cols, int dst_rows, int map1_cols, int map1_rows , double4 nVal)
{
int x = get_global_id(0);
int y = get_global_id(1);
int gx = (x << 2) - (dst_offset&3);
int4 Gx = (int4)(gx, gx+1, gx+2, gx+3);
uchar4 nval = convert_uchar4_sat_rte(nVal);
uchar val = nval.s0;
x = x << 2;
int dstStart = (y * dst_step + x + dst_offset) - (dst_offset&3);
int map1Start = y * map1_step + (x << 3) + map1_offset - ((dst_offset & 3) << 3);
float8 map1_data;
map1_data = *((__global float8 *)((__global char*)map1 + map1Start));
/* map1_data.s01 = *((__global float2 *)((__global char*)map1 + map1Start));
map1_data.s23 = *((__global float2 *)((__global char*)map1 + map1Start + 8));
map1_data.s45 = *((__global float2 *)((__global char*)map1 + map1Start + 16));
map1_data.s67 = *((__global float2 *)((__global char*)map1 + map1Start + 24));
*/
int8 map1_dataZ;
map1_dataZ = convert_int8_sat_rte(map1_data);
int4 srcIdx ;
srcIdx.s0 = map1_dataZ.s1 * src_step + map1_dataZ.s0 + src_offset;
srcIdx.s1 = map1_dataZ.s3 * src_step + map1_dataZ.s2 + src_offset;
srcIdx.s2 = map1_dataZ.s5 * src_step + map1_dataZ.s4 + src_offset;
srcIdx.s3 = map1_dataZ.s7 * src_step + map1_dataZ.s6 + src_offset;
//uchar4 src_data = *(src + srcIdx);
uchar4 src_data;
src_data.s0 = *(src + srcIdx.s0);
src_data.s1 = *(src + srcIdx.s1);
src_data.s2 = *(src + srcIdx.s2);
src_data.s3 = *(src + srcIdx.s3);
uchar4 dst_data;
dst_data.s0 = (map1_dataZ.s0 >= src_cols || map1_dataZ.s1 >= src_rows)? val : src_data.s0;
dst_data.s1 = (map1_dataZ.s2 >= src_cols || map1_dataZ.s3 >= src_rows)? val : src_data.s1;
dst_data.s2 = (map1_dataZ.s4 >= src_cols || map1_dataZ.s5 >= src_rows)? val : src_data.s2;
dst_data.s3 = (map1_dataZ.s6 >= src_cols || map1_dataZ.s7 >= src_rows)? val : src_data.s3;
__global uchar4* d = (__global uchar4 *)(dst + dstStart);
uchar4 dVal = *d;
int4 con = (Gx >= 0 && Gx < dst_cols && y >= 0 && y < dst_rows);
dst_data = (convert_uchar4(con) != 0) ? dst_data : dVal;
*d = dst_data;
}
__kernel void remapLNFConstant_C1_D0(__global unsigned char* dst, __global unsigned char const * restrict src,
__global float * map1, int dst_offset, int src_offset, int map1_offset, int dst_step, int src_step,
int map1_step, int src_cols, int src_rows, int dst_cols, int dst_rows, int map1_cols, int map1_rows , double4 nVal)
{
int x = get_global_id(0);
int y = get_global_id(1);
int gx = (x << 2) - (dst_offset&3);
int4 Gx = (int4)(gx, gx+1, gx+2, gx+3);
uchar4 nval =convert_uchar4(nVal);
uchar val = nval.s0;
x = x << 2;
int dstStart = (y * dst_step + x + dst_offset) - (dst_offset&3);
int map1Start = y * map1_step + (x << 3) + map1_offset - ((dst_offset & 3) << 3);
float8 map1_data;
map1_data = *((__global float8 *)((__global char*)map1 + map1Start));
int8 map1_dataD = convert_int8(map1_data);
float8 temp = map1_data - convert_float8(map1_dataD);
float4 u = temp.even;
float4 v = temp.odd;
float4 ud = 1.0f - u;
float4 vd = 1.0f - v;
//float8 map1_dataU = map1_dataD + 1;
int4 map1_dataDx = map1_dataD.even;
int4 map1_dataDy = map1_dataD.odd;
int4 map1_dataDx1 = map1_dataDx + 1;
int4 map1_dataDy1 = map1_dataDy + 1;
int4 src_StartU = map1_dataDy * src_step + map1_dataDx + src_offset;
int4 src_StartD = src_StartU + src_step;
int4 src_StartU1 = src_StartU + 1;
int4 src_StartD1 = src_StartD + 1;
uchar4 a, b, c, d;
a.x = *(src_StartU.x + src);
a.y = *(src_StartU.y + src);
a.z = *(src_StartU.z + src);
a.w = *(src_StartU.w + src);
b.x = *(src_StartU1.x + src);
b.y = *(src_StartU1.y + src);
b.z = *(src_StartU1.z + src);
b.w = *(src_StartU1.w + src);
c.x = *(src_StartD.x + src);
c.y = *(src_StartD.y + src);
c.z = *(src_StartD.z + src);
c.w = *(src_StartD.w + src);
d.x = *(src_StartD1.x + src);
d.y = *(src_StartD1.y + src);
d.z = *(src_StartD1.z + src);
d.w = *(src_StartD1.w + src);
int4 ac = (map1_dataDx >= src_cols || map1_dataDy >= src_rows || map1_dataDx < 0 || map1_dataDy < 0);
int4 bc = (map1_dataDx1 >= src_cols || map1_dataDy >= src_rows || map1_dataDx1 < 0 || map1_dataDy < 0);
int4 cc = (map1_dataDx >= src_cols || map1_dataDy1 >= src_rows || map1_dataDx < 0 || map1_dataDy1 < 0);
int4 dc = (map1_dataDx1 >= src_cols || map1_dataDy1 >= src_rows || map1_dataDx1 < 0 || map1_dataDy1 < 0);
a = (convert_uchar4(ac) == 0)? a : val;
b = (convert_uchar4(bc) == 0)? b : val;
c = (convert_uchar4(cc) == 0)? c : val;
d = (convert_uchar4(dc) == 0)? d : val;
uchar4 dst_data = convert_uchar4_sat_rte((convert_float4(a))* ud * vd +(convert_float4(b))* u * vd + (convert_float4(c))* ud * v + (convert_float4(d)) * u * v );
__global uchar4* D = (__global uchar4 *)(dst + dstStart);
uchar4 dVal = *D;
int4 con = (Gx >= 0 && Gx < dst_cols && y >= 0 && y < dst_rows);
dst_data = (convert_uchar4(con) != 0) ? dst_data : dVal;
*D = dst_data;
}
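// Bilinear weighting used above, with fractional offsets u, v and neighbours
// a (top-left), b (top-right), c (bottom-left), d (bottom-right):
//
//   dst = a*(1-u)*(1-v) + b*u*(1-v) + c*(1-u)*v + d*u*v
//
// e.g. u = v = 0.5 blends all four samples with equal weight 0.25.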
__kernel void remapLNFConstant_C2_D0(__global unsigned char* dst, __global unsigned char const * restrict src,
__global float * map1, int dst_offset, int src_offset, int map1_offset, int dst_step, int src_step,
int map1_step, int src_cols, int src_rows, int dst_cols, int dst_rows, int map1_cols, int map1_rows , double4 nVal)
{
int x = get_global_id(0);
int y = get_global_id(1);
int gx = (x << 3) - (dst_offset&7);
int8 Gx = (int8)(gx, gx+1, gx+2, gx+3, gx+4, gx+5, gx+6, gx+7);
uchar4 nval =convert_uchar4(nVal);
uchar8 val = (uchar8)(nval.s01, nval.s01, nval.s01, nval.s01);
x = x << 3;
int dstStart = (y * dst_step + x + dst_offset) - (dst_offset&7);
int map1Start = y * map1_step + (x << 2) + map1_offset - (((dst_offset>>1) & 3) << 3);
float8 map1_data;
map1_data = *((__global float8 *)((__global char*)map1 + map1Start));
int8 map1_dataD = convert_int8(map1_data);
float8 temp = map1_data - convert_float8(map1_dataD);
float4 U = temp.even;
float4 V = temp.odd;
float4 UD = 1.0f - U;
float4 VD = 1.0f - V;
float8 u, v, ud, vd;
u = (float8)(U.x, U.x, U.y, U.y, U.z, U.z, U.w, U.w);
v = (float8)(V.x, V.x, V.y, V.y, V.z, V.z, V.w, V.w);
ud = (float8)(UD.x, UD.x, UD.y, UD.y, UD.z, UD.z, UD.w, UD.w);
vd = (float8)(VD.x, VD.x, VD.y, VD.y, VD.z, VD.z, VD.w, VD.w);
//float8 map1_dataU = map1_dataD + 1;
int4 map1_dataDx = map1_dataD.even;
int4 map1_dataDy = map1_dataD.odd;
int4 map1_dataDx1 = map1_dataDx + 1;
int4 map1_dataDy1 = map1_dataDy + 1;
int4 src_StartU = map1_dataDy * src_step + (map1_dataDx << 1) + src_offset;
int4 src_StartD = src_StartU + src_step;
int4 src_StartU1 = src_StartU + 2;
int4 src_StartD1 = src_StartD + 2;
uchar8 a, b, c, d;
a.s01 = *((__global uchar2 *)((__global char*)src + src_StartU.x));
a.s23 = *((__global uchar2 *)((__global char*)src + src_StartU.y));
a.s45 = *((__global uchar2 *)((__global char*)src + src_StartU.z));
a.s67 = *((__global uchar2 *)((__global char*)src + src_StartU.w));
b.s01 = *((__global uchar2 *)((__global char*)src + src_StartU1.x));
b.s23 = *((__global uchar2 *)((__global char*)src + src_StartU1.y));
b.s45 = *((__global uchar2 *)((__global char*)src + src_StartU1.z));
b.s67 = *((__global uchar2 *)((__global char*)src + src_StartU1.w));
c.s01 = *((__global uchar2 *)((__global char*)src + src_StartD.x));
c.s23 = *((__global uchar2 *)((__global char*)src + src_StartD.y));
c.s45 = *((__global uchar2 *)((__global char*)src + src_StartD.z));
c.s67 = *((__global uchar2 *)((__global char*)src + src_StartD.w));
d.s01 = *((__global uchar2 *)((__global char*)src + src_StartD1.x));
d.s23 = *((__global uchar2 *)((__global char*)src + src_StartD1.y));
d.s45 = *((__global uchar2 *)((__global char*)src + src_StartD1.z));
d.s67 = *((__global uchar2 *)((__global char*)src + src_StartD1.w));
int4 ac = (map1_dataDx >= src_cols || map1_dataDy >= src_rows || map1_dataDx < 0 || map1_dataDy < 0);
int4 bc = (map1_dataDx1 >= src_cols || map1_dataDy >= src_rows || map1_dataDx1 < 0 || map1_dataDy < 0);
int4 cc = (map1_dataDx >= src_cols || map1_dataDy1 >= src_rows || map1_dataDx < 0 || map1_dataDy1 < 0);
int4 dc = (map1_dataDx1 >= src_cols || map1_dataDy1 >= src_rows || map1_dataDx1 < 0 || map1_dataDy1 < 0);
/* a.even = (convert_uchar4(ac) == 0)? a.even : val.even;
a.odd = (convert_uchar4(ac) == 0)? a.odd : val.odd;
b.even = (convert_uchar4(bc) == 0)? b.even : val.even;
b.odd = (convert_uchar4(bc) == 0)? b.odd : val.odd;
c.even = (convert_uchar4(cc) == 0)? c.even : val.even;
c.odd = (convert_uchar4(cc) == 0)? c.odd : val.odd;
d.even = (convert_uchar4(dc) == 0)? d.even : val.even;
d.odd = (convert_uchar4(dc) == 0)? d.odd : val.odd;
*/
int8 aC = (int8)(ac.x, ac.x, ac.y, ac.y, ac.z, ac.z, ac.w, ac.w);
int8 bC = (int8)(bc.x, bc.x, bc.y, bc.y, bc.z, bc.z, bc.w, bc.w);
int8 cC = (int8)(cc.x, cc.x, cc.y, cc.y, cc.z, cc.z, cc.w, cc.w);
int8 dC = (int8)(dc.x, dc.x, dc.y, dc.y, dc.z, dc.z, dc.w, dc.w);
a = (convert_uchar8(aC) == 0)? a : val;
b = (convert_uchar8(bC) == 0)? b : val;
c = (convert_uchar8(cC) == 0)? c : val;
d = (convert_uchar8(dC) == 0)? d : val;
uchar8 dst_data = convert_uchar8_sat_rte((convert_float8(a))* ud * vd +(convert_float8(b))* u * vd + (convert_float8(c))* ud * v + (convert_float8(d)) * u * v );
__global uchar8* D = (__global uchar8 *)(dst + dstStart);
uchar8 dVal = *D;
int8 con = (Gx >= 0 && Gx < (dst_cols << 1) && y >= 0 && y < dst_rows);
dst_data = (convert_uchar8(con) != 0) ? dst_data : dVal;
*D = dst_data;
}
/*
__kernel void remapLNFConstant_C4_D0(__global unsigned char* dst, __global unsigned char const * restrict src,
__global float * map1, int dst_offset, int src_offset, int map1_offset, int dst_step, int src_step,
int map1_step, int src_cols, int src_rows, int dst_cols, int dst_rows, int map1_cols, int map1_rows , double4 nVal)
{
int x = get_global_id(0);
int y = get_global_id(1);
int gx = (x << 4) - (dst_offset&15);
int16 Gx = (int16)(gx, gx+1, gx+2, gx+3, gx+4, gx+5, gx+6, gx+7, gx+8, gx+9, gx+10, gx+11, gx+12, gx+13, gx+14, gx+15);
uchar4 nval =convert_uchar4(nVal);
uchar16 val = (uchar16)(nval.s01, nval.s01, nval.s01, nval.s01, nval.s01, nval.s01, nval.s01, nval.s01, nval.s01, nval.s01, nval.s01, nval.s01, nval.s01, nval.s01, nval.s01, nval.s01);
x = x << 4;
int dstStart = (y * dst_step + x + dst_offset) - (dst_offset&15);
int map1Start = y * map1_step + (x << 1) + map1_offset - (((dst_offset>>2) & 3) << 3);
float8 map1_data;
map1_data = *((__global float8 *)((__global char*)map1 + map1Start));
int8 map1_dataD = convert_int8(map1_data);
float8 temp = map1_data - convert_float8(map1_dataD);
float4 U = temp.even;
float4 V = temp.odd;
float4 UD = 1.0 - U;
float4 VD = 1.0 - V;
float16 u, v, ud, vd;
u = (float16)(U.x, U.x, U.x, U.x, U.y, U.y, U.y, U.y, U.z, U.z, U.z, U.z, U.w, U.w, U.w, U.w);
v = (float16)(V.x, V.x, V.x, V.x, V.y, V.y, V.y, V.y, V.z, V.z, V.z, V.z, V.w, V.w, V.w, V.w);
ud = (float16)(UD.x, UD.x, UD.x, UD.x, UD.y, UD.y, UD.y, UD.y, UD.z, UD.z, UD.z, UD.z, UD.w, UD.w, UD.w, UD.w);
vd = (float16)(VD.x, VD.x, VD.x, VD.x, VD.y, VD.y, VD.y, VD.y, VD.z, VD.z, VD.z, VD.z, VD.w, VD.w, VD.w, VD.w);
//float8 map1_dataU = map1_dataD + 1;
int4 map1_dataDx = map1_dataD.even;
int4 map1_dataDy = map1_dataD.odd;
int4 map1_dataDx1 = map1_dataDx + 1;
int4 map1_dataDy1 = map1_dataDy + 1;
int4 src_StartU = map1_dataDy * src_step + (map1_dataDx << 1) + src_offset;
int4 src_StartD = src_StartU + src_step;
int4 src_StartU1 = src_StartU + 2;
int4 src_StartD1 = src_StartD + 2;
uchar8 a, b, c, d;
a.s01 = *((__global uchar2 *)((__global char*)src + src_StartU.x));
a.s23 = *((__global uchar2 *)((__global char*)src + src_StartU.y));
a.s45 = *((__global uchar2 *)((__global char*)src + src_StartU.z));
a.s67 = *((__global uchar2 *)((__global char*)src + src_StartU.w));
b.s01 = *((__global uchar2 *)((__global char*)src + src_StartU1.x));
b.s23 = *((__global uchar2 *)((__global char*)src + src_StartU1.y));
b.s45 = *((__global uchar2 *)((__global char*)src + src_StartU1.z));
b.s67 = *((__global uchar2 *)((__global char*)src + src_StartU1.w));
c.s01 = *((__global uchar2 *)((__global char*)src + src_StartD.x));
c.s23 = *((__global uchar2 *)((__global char*)src + src_StartD.y));
c.s45 = *((__global uchar2 *)((__global char*)src + src_StartD.z));
c.s67 = *((__global uchar2 *)((__global char*)src + src_StartD.w));
d.s01 = *((__global uchar2 *)((__global char*)src + src_StartD1.x));
d.s23 = *((__global uchar2 *)((__global char*)src + src_StartD1.y));
d.s45 = *((__global uchar2 *)((__global char*)src + src_StartD1.z));
d.s67 = *((__global uchar2 *)((__global char*)src + src_StartD1.w));
int4 ac =(map1_dataDx >= src_cols || map1_dataDy >= src_rows || map1_dataDy< 0 || map1_dataDy < 0);
int4 bc =(map1_dataDx1 >= src_cols || map1_dataDy >= src_rows || map1_dataDx1 < 0 || map1_dataDy < 0);
int4 cc =(map1_dataDx >= src_cols || map1_dataDy1 >= src_rows || map1_dataDy1 < 0 || map1_dataDx < 0);
int4 dc =(map1_dataDx1 >= src_cols || map1_dataDy1 >= src_rows || map1_dataDy1 < 0 || map1_dataDy1 < 0);
int8 aC = (int8)(ac.x, ac.x, ac.y, ac.y, ac.z, ac.z, ac.w, ac.w);
int8 bC = (int8)(bc.x, bc.x, bc.y, bc.y, bc.z, bc.z, bc.w, bc.w);
int8 cC = (int8)(cc.x, cc.x, cc.y, cc.y, cc.z, cc.z, cc.w, cc.w);
int8 dC = (int8)(dc.x, dc.x, dc.y, dc.y, dc.z, dc.z, dc.w, dc.w);
a = (convert_uchar8(aC) == 0)? a : val;
b = (convert_uchar8(bC) == 0)? b : val;
c = (convert_uchar8(cC) == 0)? c : val;
d = (convert_uchar8(dC) == 0)? d : val;
uchar8 dst_data = convert_uchar8_sat_rte((convert_float8(a))* ud * vd +(convert_float8(b))* u * vd + (convert_float8(c))* ud * v + (convert_float8(d)) * u * v );
__global uchar8* D = (__global uchar8 *)(dst + dstStart);
uchar8 dVal = *D;
int8 con = (Gx >= 0 && Gx < (dst_cols << 1) && y >= 0 && y < dst_rows);
dst_data = (convert_uchar8(con) != 0) ? dst_data : dVal;
*D = dst_data;
}
*/

View File

@ -0,0 +1,353 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Zhang Ying, zhangying913@gmail.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other GpuMaterials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
// resize kernel
// Currently, CV_8UC1, CV_8UC4, CV_32FC1 and CV_32FC4 are supported.
// We shall support other types later if necessary.
#if defined DOUBLE_SUPPORT
#pragma OPENCL EXTENSION cl_khr_fp64:enable
typedef double F;
#else
typedef float F;
#endif
inline uint4 getPoint_8uc4(__global uchar4 * data, int offset, int x, int y, int step)
{
return convert_uint4(data[(offset>>2)+ y * (step>>2) + x]);
}
inline float getPoint_32fc1(__global float * data, int offset, int x, int y, int step)
{
return data[(offset>>2)+ y * (step>>2) + x];
}
#define INTER_RESIZE_COEF_BITS 11
#define INTER_RESIZE_COEF_SCALE (1 << INTER_RESIZE_COEF_BITS)
#define CAST_BITS (INTER_RESIZE_COEF_BITS << 1)
#define CAST_SCALE (1.0f/(1<<CAST_BITS))
#define INC(x,l) (((x)+1) >= (l) ? (x) : ((x)+1))
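// The 8-bit linear path below works in fixed point: fractional weights are
// scaled by INTER_RESIZE_COEF_SCALE = 2048 (11 bits), the horizontal and
// vertical passes accumulate CAST_BITS = 22 fractional bits, and the result is
// rounded by adding 1 << (CAST_BITS-1) before the final shift. For example
// u = 0.25 gives U = 512 and U1 = 1536, and (U+U1)*(V+V1) == 1 << CAST_BITS,
// so the quantized weights still sum to exactly one.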
__kernel void resizeLN_C1_D0(__global unsigned char * dst, __global unsigned char const * restrict src,
int dst_offset, int src_offset,int dst_step, int src_step,
int src_cols, int src_rows, int dst_cols, int dst_rows, float ifx, float ify )
{
int gx = get_global_id(0);
int dy = get_global_id(1);
float4 sx, u, xf;
int4 x, DX;
gx = (gx<<2) - (dst_offset&3);
DX = (int4)(gx, gx+1, gx+2, gx+3);
sx = (convert_float4(DX) + 0.5f) * ifx - 0.5f;
xf = floor(sx);
x = convert_int4(xf);
u = sx - xf;
float sy = ((dy+0.5f) * ify - 0.5f);
int y = floor(sy);
float v = sy - y;
u = x < 0 ? 0 : u;
u = (x >= src_cols) ? 0 : u;
x = x < 0 ? 0 : x;
x = (x >= src_cols) ? src_cols-1 : x;
if (y < 0) { y = 0; v = 0; }
if (y >= src_rows) { y = src_rows - 1; v = 0; }
int4 U, U1;
int V, V1;
float4 utmp1, utmp2;
float vtmp;
float4 scale_vec = INTER_RESIZE_COEF_SCALE;
utmp1 = u * scale_vec;
utmp2 = scale_vec - utmp1;
U = convert_int4(rint(utmp1));
U1 = convert_int4(rint(utmp2));
vtmp = v * INTER_RESIZE_COEF_SCALE;
V = rint(vtmp);
V1= rint(INTER_RESIZE_COEF_SCALE - vtmp);
int y_ = INC(y,src_rows);
int4 x_;
x_ = ((x+1 >= src_cols) != 0) ? x : x+1;
int4 val1, val2, val;
int4 sdata1, sdata2, sdata3, sdata4;
int4 pos1 = src_offset + y * src_step + x;
int4 pos2 = src_offset + y * src_step + x_;
int4 pos3 = src_offset + y_ * src_step + x;
int4 pos4 = src_offset + y_ * src_step + x_;
sdata1.s0 = src[pos1.s0];
sdata1.s1 = src[pos1.s1];
sdata1.s2 = src[pos1.s2];
sdata1.s3 = src[pos1.s3];
sdata2.s0 = src[pos2.s0];
sdata2.s1 = src[pos2.s1];
sdata2.s2 = src[pos2.s2];
sdata2.s3 = src[pos2.s3];
sdata3.s0 = src[pos3.s0];
sdata3.s1 = src[pos3.s1];
sdata3.s2 = src[pos3.s2];
sdata3.s3 = src[pos3.s3];
sdata4.s0 = src[pos4.s0];
sdata4.s1 = src[pos4.s1];
sdata4.s2 = src[pos4.s2];
sdata4.s3 = src[pos4.s3];
val1 = U1 * sdata1 + U * sdata2;
val2 = U1 * sdata3 + U * sdata4;
val = V1 * val1 + V * val2;
__global uchar4* d = (__global uchar4*)(dst + dst_offset + dy * dst_step + gx);
uchar4 dVal = *d;
int4 con = ( DX >= 0 && DX < dst_cols && dy >= 0 && dy < dst_rows);
val = ((val + (1<<(CAST_BITS-1))) >> CAST_BITS);
*d = convert_uchar4(con != 0) ? convert_uchar4_sat(val) : dVal;
}
__kernel void resizeLN_C4_D0(__global uchar4 * dst, __global uchar4 * src,
int dst_offset, int src_offset,int dst_step, int src_step,
int src_cols, int src_rows, int dst_cols, int dst_rows, float ifx, float ify )
{
int dx = get_global_id(0);
int dy = get_global_id(1);
float sx = ((dx+0.5f) * ifx - 0.5f), sy = ((dy+0.5f) * ify - 0.5f);
int x = floor(sx), y = floor(sy);
float u = sx - x, v = sy - y;
if (x < 0) { x = 0; u = 0; }
if (x >= src_cols) { x = src_cols - 1; u = 0; }
if (y < 0) { y = 0; v = 0; }
if (y >= src_rows) { y = src_rows - 1; v = 0; }
u = u * INTER_RESIZE_COEF_SCALE;
v = v * INTER_RESIZE_COEF_SCALE;
int U = rint(u);
int V = rint(v);
int U1= rint(INTER_RESIZE_COEF_SCALE - u);
int V1= rint(INTER_RESIZE_COEF_SCALE - v);
int y_ = INC(y,src_rows);
int x_ = INC(x,src_cols);
uint4 val = U1* V1 * getPoint_8uc4(src,src_offset,x,y,src_step) +
U1* V * getPoint_8uc4(src,src_offset,x,y_,src_step) +
U * V1 * getPoint_8uc4(src,src_offset,x_,y,src_step) +
U * V * getPoint_8uc4(src,src_offset,x_,y_,src_step);
if(dx>=0 && dx<dst_cols && dy>=0 && dy<dst_rows)
dst[(dst_offset>>2) + dy * (dst_step>>2) + dx] = convert_uchar4((val + (1<<(CAST_BITS-1)))>>CAST_BITS);
}
__kernel void resizeLN_C1_D5(__global float * dst, __global float * src,
int dst_offset, int src_offset,int dst_step, int src_step,
int src_cols, int src_rows, int dst_cols, int dst_rows, float ifx, float ify )
{
int dx = get_global_id(0);
int dy = get_global_id(1);
float sx = ((dx+0.5f) * ifx - 0.5f), sy = ((dy+0.5f) * ify - 0.5f);
int x = floor(sx), y = floor(sy);
float u = sx - x, v = sy - y;
if (x < 0) { x = 0; u = 0; }
if (x >= src_cols) { x = src_cols - 1; u = 0; }
if (y < 0) { y = 0; v = 0; }
if (y >= src_rows) { y = src_rows - 1; v = 0; }
int y_ = INC(y,src_rows);
int x_ = INC(x,src_cols);
float val1 = (1.0f-u) * getPoint_32fc1(src,src_offset,x,y,src_step) +
u * getPoint_32fc1(src,src_offset,x_,y,src_step) ;
float val2 = (1.0f-u) * getPoint_32fc1(src,src_offset,x,y_,src_step) +
u * getPoint_32fc1(src,src_offset,x_,y_,src_step);
float val = (1.0f-v) * val1 + v * val2;
if(dx>=0 && dx<dst_cols && dy>=0 && dy<dst_rows)
dst[(dst_offset>>2) + dy * (dst_step>>2) + dx] = val;
}
__kernel void resizeLN_C4_D5(__global float4 * dst, __global float4 * src,
int dst_offset, int src_offset,int dst_step, int src_step,
int src_cols, int src_rows, int dst_cols, int dst_rows, float ifx, float ify )
{
int dx = get_global_id(0);
int dy = get_global_id(1);
float sx = ((dx+0.5f) * ifx - 0.5f), sy = ((dy+0.5f) * ify - 0.5f);
int x = floor(sx), y = floor(sy);
float u = sx - x, v = sy - y;
if (x < 0) { x = 0; u = 0; }
if (x >= src_cols) { x = src_cols - 1; u = 0; }
if (y < 0) { y = 0; v = 0; }
if (y >= src_rows) { y = src_rows - 1; v = 0; }
int y_ = INC(y,src_rows);
int x_ = INC(x,src_cols);
float4 s_data1, s_data2, s_data3, s_data4;
src_offset = (src_offset >> 4);
src_step = (src_step >> 4);
s_data1 = src[src_offset + y*src_step + x];
s_data2 = src[src_offset + y*src_step + x_];
s_data3 = src[src_offset + y_*src_step + x];
s_data4 = src[src_offset + y_*src_step + x_];
s_data1 = (1.0f-u) * s_data1 + u * s_data2;
s_data2 = (1.0f-u) * s_data3 + u * s_data4;
s_data3 = (1.0f-v) * s_data1 + v * s_data2;
if(dx>=0 && dx<dst_cols && dy>=0 && dy<dst_rows)
dst[(dst_offset>>4) + dy * (dst_step>>4) + dx] = s_data3;
}
__kernel void resizeNN_C1_D0(__global uchar * dst, __global uchar * src,
int dst_offset, int src_offset,int dst_step, int src_step,
int src_cols, int src_rows, int dst_cols, int dst_rows, F ifx, F ify )
{
int gx = get_global_id(0);
int dy = get_global_id(1);
gx = (gx<<2) - (dst_offset&3);
int4 GX = (int4)(gx, gx+1, gx+2, gx+3);
int4 sx;
int sy;
F ss1 = gx*ifx;
F ss2 = (gx+1)*ifx;
F ss3 = (gx+2)*ifx;
F ss4 = (gx+3)*ifx;
F s5 = dy * ify;
sx.s0 = min((int)floor(ss1), src_cols-1);
sx.s1 = min((int)floor(ss2), src_cols-1);
sx.s2 = min((int)floor(ss3), src_cols-1);
sx.s3 = min((int)floor(ss4), src_cols-1);
sy = min((int)floor(s5), src_rows-1);
uchar4 val;
int4 pos = src_offset + sy * src_step + sx;
val.s0 = src[pos.s0];
val.s1 = src[pos.s1];
val.s2 = src[pos.s2];
val.s3 = src[pos.s3];
__global uchar4* d = (__global uchar4*)(dst + dst_offset + dy * dst_step + gx);
uchar4 dVal = *d;
int4 con = (GX >= 0 && GX < dst_cols && dy >= 0 && dy < dst_rows);
val = convert_uchar4(con != 0) ? val : dVal;
*d = val;
}
__kernel void resizeNN_C4_D0(__global uchar4 * dst, __global uchar4 * src,
int dst_offset, int src_offset,int dst_step, int src_step,
int src_cols, int src_rows, int dst_cols, int dst_rows, F ifx, F ify )
{
int dx = get_global_id(0);
int dy = get_global_id(1);
F s1 = dx*ifx;
F s2 = dy*ify;
int sx = fmin((float)floor(s1), (float)src_cols-1);
int sy = fmin((float)floor(s2), (float)src_rows-1);
int dpos = (dst_offset>>2) + dy * (dst_step>>2) + dx;
int spos = (src_offset>>2) + sy * (src_step>>2) + sx;
if(dx>=0 && dx<dst_cols && dy>=0 && dy<dst_rows)
dst[dpos] = src[spos];
}
__kernel void resizeNN_C1_D5(__global float * dst, __global float * src,
int dst_offset, int src_offset,int dst_step, int src_step,
int src_cols, int src_rows, int dst_cols, int dst_rows, F ifx, F ify )
{
int dx = get_global_id(0);
int dy = get_global_id(1);
F s1 = dx*ifx;
F s2 = dy*ify;
int sx = fmin((float)floor(s1), (float)src_cols-1);
int sy = fmin((float)floor(s2), (float)src_rows-1);
int dpos = (dst_offset>>2) + dy * (dst_step>>2) + dx;
int spos = (src_offset>>2) + sy * (src_step>>2) + sx;
if(dx>=0 && dx<dst_cols && dy>=0 && dy<dst_rows)
dst[dpos] = src[spos];
}
__kernel void resizeNN_C4_D5(__global float4 * dst, __global float4 * src,
int dst_offset, int src_offset,int dst_step, int src_step,
int src_cols, int src_rows, int dst_cols, int dst_rows, F ifx, F ify )
{
int dx = get_global_id(0);
int dy = get_global_id(1);
F s1 = dx*ifx;
F s2 = dy*ify;
int s_col = floor(s1);
int s_row = floor(s2);
int sx = min(s_col, src_cols-1);
int sy = min(s_row, src_rows-1);
int dpos = (dst_offset>>4) + dy * (dst_step>>4) + dx;
int spos = (src_offset>>4) + sy * (src_step>>4) + sx;
if(dx>=0 && dx<dst_cols && dy>=0 && dy<dst_rows)
dst[dpos] = src[spos];
}
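// ifx and ify are the inverse scale factors passed by the host (src size over
// dst size): the linear kernels map a dst pixel back into src space with
// sx = (dx + 0.5f) * ifx - 0.5f, which keeps pixel centres aligned between the
// two images, while the NN kernels use the simpler sx = dx * ifx.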

View File

@ -0,0 +1,153 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Zhang Ying, zhangying913@gmail.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other oclMaterials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if defined (DOUBLE_SUPPORT)
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#endif
// threshold type:
// enum { THRESH_BINARY=0, THRESH_BINARY_INV=1, THRESH_TRUNC=2, THRESH_TOZERO=3,
// THRESH_TOZERO_INV=4, THRESH_MASK=7, THRESH_OTSU=8 };
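// In formula form (t = thresh, M = max_val), matching the switch cases below:
//   THRESH_BINARY:     dst = src > t ? M : 0
//   THRESH_BINARY_INV: dst = src > t ? 0 : M
//   THRESH_TRUNC:      dst = src > t ? t : src
//   THRESH_TOZERO:     dst = src > t ? src : 0
//   THRESH_TOZERO_INV: dst = src > t ? 0 : src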
__kernel void threshold_C1_D0(__global const uchar * restrict src, __global uchar *dst,
int src_offset, int src_step,
int dst_offset, int dst_rows, int dst_cols, int dst_step,
uchar thresh, uchar max_val, int thresh_type
)
{
int gx = get_global_id(0);
const int gy = get_global_id(1);
int offset = (dst_offset & 15);
src_offset -= offset;
int dstart = (gx << 4) - offset;
if(dstart < dst_cols && gy < dst_rows)
{
uchar16 sdata = vload16(gx, src+src_offset+gy*src_step);
uchar16 ddata;
uchar16 zero = 0;
switch (thresh_type)
{
case 0:
ddata = ((sdata > thresh) ) ? (uchar16)(max_val) : (uchar16)(0);
break;
case 1:
ddata = ((sdata > thresh)) ? zero : (uchar16)(max_val);
break;
case 2:
ddata = ((sdata > thresh)) ? (uchar16)(thresh) : sdata;
break;
case 3:
ddata = ((sdata > thresh)) ? sdata : zero;
break;
case 4:
ddata = ((sdata > thresh)) ? zero : sdata;
break;
default:
ddata = sdata;
}
int16 dpos = (int16)(dstart, dstart+1, dstart+2, dstart+3, dstart+4, dstart+5, dstart+6, dstart+7, dstart+8,
dstart+9, dstart+10, dstart+11, dstart+12, dstart+13, dstart+14, dstart+15);
uchar16 dVal = *(__global uchar16*)(dst+dst_offset+gy*dst_step+dstart);
int16 con = dpos >= 0 && dpos < dst_cols;
ddata = convert_uchar16(con != 0) ? ddata : dVal;
if(dstart < dst_cols)
{
*(__global uchar16*)(dst+dst_offset+gy*dst_step+dstart) = ddata;
}
}
}
__kernel void threshold_C1_D5(__global const float * restrict src, __global float *dst,
int src_offset, int src_step,
int dst_offset, int dst_rows, int dst_cols, int dst_step,
float thresh, float max_val, int thresh_type
)
{
const int gx = get_global_id(0);
const int gy = get_global_id(1);
int offset = (dst_offset & 3);
src_offset -= offset;
int dstart = (gx << 2) - offset;
if(dstart < dst_cols && gy < dst_rows)
{
float4 sdata = vload4(gx, src+src_offset+gy*src_step);
float4 ddata;
float4 zero = 0;
switch (thresh_type)
{
case 0:
ddata = sdata > thresh ? (float4)(max_val) : (float4)(0.f);
break;
case 1:
ddata = sdata > thresh ? zero : (float4)max_val;
break;
case 2:
ddata = sdata > thresh ? (float4)thresh : sdata;
break;
case 3:
ddata = sdata > thresh ? sdata : (float4)(0.f);
break;
case 4:
ddata = sdata > thresh ? (float4)(0.f) : sdata;
break;
default:
ddata = sdata;
}
int4 dpos = (int4)(dstart, dstart+1, dstart+2, dstart+3);
float4 dVal = *(__global float4*)(dst+dst_offset+gy*dst_step+dstart);
int4 con = dpos >= 0 && dpos < dst_cols;
ddata = convert_float4(con) != 0 ? ddata : dVal;
if(dstart < dst_cols)
{
*(__global float4*)(dst+dst_offset+gy*dst_step+dstart) = ddata;
}
}
}

View File

@ -0,0 +1,718 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Zhang Ying, zhangying913@gmail.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other GpuMaterials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
// warpAffine kernel
// Supported data types: CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4; supported interpolation methods: NN, Linear, Cubic.
#if defined DOUBLE_SUPPORT
#pragma OPENCL EXTENSION cl_khr_fp64:enable
typedef double F;
typedef double4 F4;
#define convert_F4 convert_double4
#else
typedef float F;
typedef float4 F4;
#define convert_F4 convert_float4
#endif
#define INTER_BITS 5
#define INTER_TAB_SIZE (1 << INTER_BITS)
#define INTER_SCALE (1.f/INTER_TAB_SIZE)
#define AB_BITS max(10, (int)INTER_BITS)
#define AB_SCALE (1 << AB_BITS)
#define INTER_REMAP_COEF_BITS 15
#define INTER_REMAP_COEF_SCALE (1 << INTER_REMAP_COEF_BITS)
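// The affine map is evaluated in fixed point: X = round((M[0]*x + M[1]*y +
// M[2]) * AB_SCALE) carries AB_BITS = 10 fractional bits, round_delta adds the
// 0.5 needed for round-to-nearest before the right shifts, and in the
// linear/cubic paths the low INTER_BITS = 5 bits of X and Y select one of
// INTER_TAB_SIZE = 32 fractional interpolation positions per axis.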
inline void interpolateCubic( float x, float* coeffs )
{
const float A = -0.75f;
coeffs[0] = ((A*(x + 1.f) - 5.0f*A)*(x + 1.f) + 8.0f*A)*(x + 1.f) - 4.0f*A;
coeffs[1] = ((A + 2.f)*x - (A + 3.f))*x*x + 1.f;
coeffs[2] = ((A + 2.f)*(1.f - x) - (A + 3.f))*(1.f - x)*(1.f - x) + 1.f;
coeffs[3] = 1.f - coeffs[0] - coeffs[1] - coeffs[2];
}
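// interpolateCubic() computes the four bicubic weights (Catmull-Rom-style
// kernel with A = -0.75) for a fractional position x in [0,1); coeffs[3] is
// defined so that the weights always sum to 1. Sanity check: x = 0 yields
// {0, 1, 0, 0}, i.e. the interpolant passes exactly through the samples.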
/**********************************************8UC1*********************************************
***********************************************************************************************/
__kernel void warpAffineNN_C1_D0(__global uchar const * restrict src, __global uchar * dst, int src_cols, int src_rows,
int dst_cols, int dst_rows, int srcStep, int dstStep,
int src_offset, int dst_offset, __constant F * M )
{
int dx = get_global_id(0);
int dy = get_global_id(1);
dx = (dx<<2) - (dst_offset&3);
int round_delta = (AB_SCALE>>1);
int4 X, Y;
int4 sx, sy;
int4 DX = (int4)(dx, dx+1, dx+2, dx+3);
DX = (DX << AB_BITS);
F4 M0DX, M3DX;
M0DX = M[0] * convert_F4(DX);
M3DX = M[3] * convert_F4(DX);
X = convert_int4(rint(M0DX));
Y = convert_int4(rint(M3DX));
int tmp1, tmp2;
tmp1 = rint((M[1]*dy + M[2]) * AB_SCALE);
tmp2 = rint((M[4]*dy + M[5]) * AB_SCALE);
X += tmp1 + round_delta;
Y += tmp2 + round_delta;
sx = convert_int4(convert_short4(X >> AB_BITS));
sy = convert_int4(convert_short4(Y >> AB_BITS));
__global uchar4 * d = (__global uchar4 *)(dst+dst_offset+dy*dstStep+dx);
uchar4 dval = *d;
DX = (int4)(dx, dx+1, dx+2, dx+3);
int4 dcon = DX >= 0 && DX < dst_cols && dy >= 0 && dy < dst_rows;
int4 scon = sx >= 0 && sx < src_cols && sy >= 0 && sy < src_rows;
int4 spos = src_offset + sy * srcStep + sx;
uchar4 sval;
sval.s0 = scon.s0 ? src[spos.s0] : 0;
sval.s1 = scon.s1 ? src[spos.s1] : 0;
sval.s2 = scon.s2 ? src[spos.s2] : 0;
sval.s3 = scon.s3 ? src[spos.s3] : 0;
dval = convert_uchar4(dcon != 0) ? sval : dval;
*d = dval;
}
__kernel void warpAffineLinear_C1_D0(__global const uchar * restrict src, __global uchar * dst, int src_cols, int src_rows,
int dst_cols, int dst_rows, int srcStep, int dstStep,
int src_offset, int dst_offset, __constant F * M )
{
int dx = get_global_id(0);
int dy = get_global_id(1);
dx = (dx<<2) - (dst_offset&3);
int round_delta = ((AB_SCALE >> INTER_BITS) >> 1);
int4 X, Y;
short4 ax, ay;
int4 sx, sy;
int4 DX = (int4)(dx, dx+1, dx+2, dx+3);
DX = (DX << AB_BITS);
F4 M0DX, M3DX;
M0DX = M[0] * convert_F4(DX);
M3DX = M[3] * convert_F4(DX);
X = convert_int4(rint(M0DX));
Y = convert_int4(rint(M3DX));
int tmp1, tmp2;
tmp1 = rint((M[1]*dy + M[2]) * AB_SCALE);
tmp2 = rint((M[4]*dy + M[5]) * AB_SCALE);
X += tmp1 + round_delta;
Y += tmp2 + round_delta;
X = X >> (AB_BITS - INTER_BITS);
Y = Y >> (AB_BITS - INTER_BITS);
sx = convert_int4(convert_short4(X >> INTER_BITS));
sy = convert_int4(convert_short4(Y >> INTER_BITS));
ax = convert_short4(X & (INTER_TAB_SIZE-1));
ay = convert_short4(Y & (INTER_TAB_SIZE-1));
uchar4 v0, v1, v2,v3;
int4 scon0, scon1, scon2, scon3;
int4 spos0, spos1, spos2, spos3;
scon0 = (sx >= 0 && sx < src_cols && sy >= 0 && sy < src_rows);
scon1 = (sx+1 >= 0 && sx+1 < src_cols && sy >= 0 && sy < src_rows);
scon2 = (sx >= 0 && sx < src_cols && sy+1 >= 0 && sy+1 < src_rows);
scon3 = (sx+1 >= 0 && sx+1 < src_cols && sy+1 >= 0 && sy+1 < src_rows);
spos0 = src_offset + sy * srcStep + sx;
spos1 = src_offset + sy * srcStep + sx + 1;
spos2 = src_offset + (sy+1) * srcStep + sx;
spos3 = src_offset + (sy+1) * srcStep + sx + 1;
v0.s0 = scon0.s0 ? src[spos0.s0] : 0;
v1.s0 = scon1.s0 ? src[spos1.s0] : 0;
v2.s0 = scon2.s0 ? src[spos2.s0] : 0;
v3.s0 = scon3.s0 ? src[spos3.s0] : 0;
v0.s1 = scon0.s1 ? src[spos0.s1] : 0;
v1.s1 = scon1.s1 ? src[spos1.s1] : 0;
v2.s1 = scon2.s1 ? src[spos2.s1] : 0;
v3.s1 = scon3.s1 ? src[spos3.s1] : 0;
v0.s2 = scon0.s2 ? src[spos0.s2] : 0;
v1.s2 = scon1.s2 ? src[spos1.s2] : 0;
v2.s2 = scon2.s2 ? src[spos2.s2] : 0;
v3.s2 = scon3.s2 ? src[spos3.s2] : 0;
v0.s3 = scon0.s3 ? src[spos0.s3] : 0;
v1.s3 = scon1.s3 ? src[spos1.s3] : 0;
v2.s3 = scon2.s3 ? src[spos2.s3] : 0;
v3.s3 = scon3.s3 ? src[spos3.s3] : 0;
short4 itab0, itab1, itab2, itab3;
float4 taby, tabx;
taby = INTER_SCALE * convert_float4(ay);
tabx = INTER_SCALE * convert_float4(ax);
itab0 = convert_short4_sat(( (1.0f-taby)*(1.0f-tabx) * INTER_REMAP_COEF_SCALE ));
itab1 = convert_short4_sat(( (1.0f-taby)*tabx * INTER_REMAP_COEF_SCALE ));
itab2 = convert_short4_sat(( taby*(1.0f-tabx) * INTER_REMAP_COEF_SCALE ));
itab3 = convert_short4_sat(( taby*tabx * INTER_REMAP_COEF_SCALE ));
int4 val;
uchar4 tval;
val = convert_int4(v0) * convert_int4(itab0) + convert_int4(v1) * convert_int4(itab1)
+ convert_int4(v2) * convert_int4(itab2) + convert_int4(v3) * convert_int4(itab3);
tval = convert_uchar4_sat ( (val + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS ) ;
__global uchar4 * d =(__global uchar4 *)(dst+dst_offset+dy*dstStep+dx);
uchar4 dval = *d;
DX = (int4)(dx, dx+1, dx+2, dx+3);
int4 dcon = DX >= 0 && DX < dst_cols && dy >= 0 && dy < dst_rows;
dval = convert_uchar4(dcon != 0) ? tval : dval;
*d = dval;
}
__kernel void warpAffineCubic_C1_D0(__global uchar * src, __global uchar * dst, int src_cols, int src_rows,
int dst_cols, int dst_rows, int srcStep, int dstStep,
int src_offset, int dst_offset, __constant F * M )
{
int dx = get_global_id(0);
int dy = get_global_id(1);
int round_delta = ((AB_SCALE>>INTER_BITS)>>1);
int X0 = rint(M[0] * dx * AB_SCALE);
int Y0 = rint(M[3] * dx * AB_SCALE);
X0 += rint((M[1]*dy + M[2]) * AB_SCALE) + round_delta;
Y0 += rint((M[4]*dy + M[5]) * AB_SCALE) + round_delta;
int X = X0 >> (AB_BITS - INTER_BITS);
int Y = Y0 >> (AB_BITS - INTER_BITS);
short sx = (short)(X >> INTER_BITS) - 1;
short sy = (short)(Y >> INTER_BITS) - 1;
short ay = (short)(Y & (INTER_TAB_SIZE-1));
short ax = (short)(X & (INTER_TAB_SIZE-1));
uchar v[16];
int i, j;
#pragma unroll 4
for(i=0; i<4; i++)
for(j=0; j<4; j++)
{
v[i*4+j] = (sx+j >= 0 && sx+j < src_cols && sy+i >= 0 && sy+i < src_rows) ? src[src_offset+(sy+i) * srcStep + (sx+j)] : 0;
}
short itab[16];
float tab1y[4], tab1x[4];
float axx, ayy;
ayy = 1.f/INTER_TAB_SIZE * ay;
axx = 1.f/INTER_TAB_SIZE * ax;
interpolateCubic(ayy, tab1y);
interpolateCubic(axx, tab1x);
int isum = 0;
#pragma unroll 16
for( i=0; i<16; i++ )
{
F w = tab1y[(i>>2)] * tab1x[(i&3)];
isum += itab[i] = convert_short_sat( rint( w * INTER_REMAP_COEF_SCALE ) );
}
if( isum != INTER_REMAP_COEF_SCALE )
{
int k1, k2;
int diff = isum - INTER_REMAP_COEF_SCALE;
int Mk1=2, Mk2=2, mk1=2, mk2=2;
for( k1 = 2; k1 < 4; k1++ )
for( k2 = 2; k2 < 4; k2++ )
{
if( itab[(k1<<2)+k2] < itab[(mk1<<2)+mk2] )
mk1 = k1, mk2 = k2;
else if( itab[(k1<<2)+k2] > itab[(Mk1<<2)+Mk2] )
Mk1 = k1, Mk2 = k2;
}
if( diff < 0 )
    itab[(Mk1<<2)+Mk2] = (short)(itab[(Mk1<<2)+Mk2] - diff);
else
    itab[(mk1<<2)+mk2] = (short)(itab[(mk1<<2)+mk2] - diff);
}
if( dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
{
int sum=0;
for ( i =0; i<16; i++ )
{
sum += v[i] * itab[i] ;
}
dst[dst_offset+dy*dstStep+dx] = convert_uchar_sat( (sum + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS ) ;
}
}
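// The isum correction above exists because the 16 rounded fixed-point weights
// may not sum exactly to INTER_REMAP_COEF_SCALE: the residual is added to the
// largest coefficient (when the sum fell short) or subtracted from the
// smallest (when it overshot) among the taps at rows 2-3, cols 2-3, so that
// constant regions are reproduced bit-exactly.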
/**********************************************8UC4*********************************************
***********************************************************************************************/
__kernel void warpAffineNN_C4_D0(__global uchar4 const * restrict src, __global uchar4 * dst, int src_cols, int src_rows,
int dst_cols, int dst_rows, int srcStep, int dstStep,
int src_offset, int dst_offset, __constant F * M )
{
int dx = get_global_id(0);
int dy = get_global_id(1);
int round_delta = (AB_SCALE >> 1);
int X0 = rint(M[0] * dx * AB_SCALE);
int Y0 = rint(M[3] * dx * AB_SCALE);
X0 += rint((M[1]*dy + M[2]) * AB_SCALE) + round_delta;
Y0 += rint((M[4]*dy + M[5]) * AB_SCALE) + round_delta;
int sx0 = (short)(X0 >> AB_BITS);
int sy0 = (short)(Y0 >> AB_BITS);
if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
dst[(dst_offset>>2)+dy*(dstStep>>2)+dx]= (sx0>=0 && sx0<src_cols && sy0>=0 && sy0<src_rows) ? src[(src_offset>>2)+sy0*(srcStep>>2)+sx0] : (uchar4)0;
}
__kernel void warpAffineLinear_C4_D0(__global uchar4 const * restrict src, __global uchar4 * dst, int src_cols, int src_rows,
int dst_cols, int dst_rows, int srcStep, int dstStep,
int src_offset, int dst_offset, __constant F * M )
{
int dx = get_global_id(0);
int dy = get_global_id(1);
int round_delta = AB_SCALE/INTER_TAB_SIZE/2;
src_offset = (src_offset>>2);
srcStep = (srcStep>>2);
int tmp = (dx << AB_BITS);
int X0 = rint(M[0] * tmp);
int Y0 = rint(M[3] * tmp);
X0 += rint((M[1]*dy + M[2]) * AB_SCALE) + round_delta;
Y0 += rint((M[4]*dy + M[5]) * AB_SCALE) + round_delta;
X0 = X0 >> (AB_BITS - INTER_BITS);
Y0 = Y0 >> (AB_BITS - INTER_BITS);
short sx0 = (short)(X0 >> INTER_BITS);
short sy0 = (short)(Y0 >> INTER_BITS);
short ax0 = (short)(X0 & (INTER_TAB_SIZE-1));
short ay0 = (short)(Y0 & (INTER_TAB_SIZE-1));
int4 v0, v1, v2, v3;
v0 = (sx0 >= 0 && sx0 < src_cols && sy0 >= 0 && sy0 < src_rows) ? convert_int4(src[src_offset+sy0 * srcStep + sx0]) : 0;
v1 = (sx0+1 >= 0 && sx0+1 < src_cols && sy0 >= 0 && sy0 < src_rows) ? convert_int4(src[src_offset+sy0 * srcStep + sx0+1]) : 0;
v2 = (sx0 >= 0 && sx0 < src_cols && sy0+1 >= 0 && sy0+1 < src_rows) ? convert_int4(src[src_offset+(sy0+1) * srcStep + sx0]) : 0;
v3 = (sx0+1 >= 0 && sx0+1 < src_cols && sy0+1 >= 0 && sy0+1 < src_rows) ? convert_int4(src[src_offset+(sy0+1) * srcStep + sx0+1]) : 0;
int itab0, itab1, itab2, itab3;
float taby, tabx;
taby = 1.f/INTER_TAB_SIZE*ay0;
tabx = 1.f/INTER_TAB_SIZE*ax0;
itab0 = convert_short_sat(rint( (1.0f-taby)*(1.0f-tabx) * INTER_REMAP_COEF_SCALE ));
itab1 = convert_short_sat(rint( (1.0f-taby)*tabx * INTER_REMAP_COEF_SCALE ));
itab2 = convert_short_sat(rint( taby*(1.0f-tabx) * INTER_REMAP_COEF_SCALE ));
itab3 = convert_short_sat(rint( taby*tabx * INTER_REMAP_COEF_SCALE ));
int4 val;
val = v0 * itab0 + v1 * itab1 + v2 * itab2 + v3 * itab3;
if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
dst[(dst_offset>>2)+dy*(dstStep>>2)+dx] = convert_uchar4_sat ( (val + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS ) ;
}
__kernel void warpAffineCubic_C4_D0(__global uchar4 const * restrict src, __global uchar4 * dst, int src_cols, int src_rows,
int dst_cols, int dst_rows, int srcStep, int dstStep,
int src_offset, int dst_offset, __constant F * M )
{
int dx = get_global_id(0);
int dy = get_global_id(1);
int round_delta = ((AB_SCALE>>INTER_BITS)>>1);
src_offset = (src_offset>>2);
srcStep = (srcStep>>2);
dst_offset = (dst_offset>>2);
dstStep = (dstStep>>2);
int tmp = (dx << AB_BITS);
int X0 = rint(M[0] * tmp);
int Y0 = rint(M[3] * tmp);
X0 += rint((M[1]*dy + M[2]) * AB_SCALE) + round_delta;
Y0 += rint((M[4]*dy + M[5]) * AB_SCALE) + round_delta;
X0 = X0 >> (AB_BITS - INTER_BITS);
Y0 = Y0 >> (AB_BITS - INTER_BITS);
int sx = (short)(X0 >> INTER_BITS) - 1;
int sy = (short)(Y0 >> INTER_BITS) - 1;
int ay = (short)(Y0 & (INTER_TAB_SIZE-1));
int ax = (short)(X0 & (INTER_TAB_SIZE-1));
uchar4 v[16];
int i,j;
#pragma unroll 4
for(i=0; i<4; i++)
for(j=0; j<4; j++)
{
v[i*4+j] = (sx+j >= 0 && sx+j < src_cols && sy+i >= 0 && sy+i < src_rows) ? (src[src_offset+(sy+i) * srcStep + (sx+j)]) : (uchar4)0;
}
int itab[16];
float tab1y[4], tab1x[4];
float axx, ayy;
ayy = INTER_SCALE * ay;
axx = INTER_SCALE * ax;
interpolateCubic(ayy, tab1y);
interpolateCubic(axx, tab1x);
int isum = 0;
#pragma unroll 16
for( i=0; i<16; i++ )
{
float tmp;
tmp = tab1y[(i>>2)] * tab1x[(i&3)] * INTER_REMAP_COEF_SCALE;
itab[i] = rint(tmp);
isum += itab[i];
}
if( isum != INTER_REMAP_COEF_SCALE )
{
int k1, k2;
int diff = isum - INTER_REMAP_COEF_SCALE;
int Mk1=2, Mk2=2, mk1=2, mk2=2;
for( k1 = 2; k1 < 4; k1++ )
for( k2 = 2; k2 < 4; k2++ )
{
if( itab[(k1<<2)+k2] < itab[(mk1<<2)+mk2] )
mk1 = k1, mk2 = k2;
else if( itab[(k1<<2)+k2] > itab[(Mk1<<2)+Mk2] )
Mk1 = k1, Mk2 = k2;
}
diff<0 ? (itab[(Mk1<<2)+Mk2]=(short)(itab[(Mk1<<2)+Mk2]-diff)) : (itab[(mk1<<2)+mk2]=(short)(itab[(mk1<<2)+mk2]-diff));
}
if( dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
{
int4 sum=0;
for ( i =0; i<16; i++ )
{
sum += convert_int4(v[i]) * itab[i];
}
dst[dst_offset+dy*dstStep+dx] = convert_uchar4_sat( (sum + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS ) ;
}
}
/**********************************************32FC1********************************************
***********************************************************************************************/
__kernel void warpAffineNN_C1_D5(__global float * src, __global float * dst, int src_cols, int src_rows,
int dst_cols, int dst_rows, int srcStep, int dstStep,
int src_offset, int dst_offset, __constant F * M )
{
int dx = get_global_id(0);
int dy = get_global_id(1);
int round_delta = AB_SCALE/2;
int X0 = rint(M[0] * dx * AB_SCALE);
int Y0 = rint(M[3] * dx * AB_SCALE);
X0 += rint((M[1]*dy + M[2]) * AB_SCALE) + round_delta;
Y0 += rint((M[4]*dy + M[5]) * AB_SCALE) + round_delta;
short sx0 = (short)(X0 >> AB_BITS);
short sy0 = (short)(Y0 >> AB_BITS);
if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
dst[(dst_offset>>2)+dy*dstStep+dx]= (sx0>=0 && sx0<src_cols && sy0>=0 && sy0<src_rows) ? src[(src_offset>>2)+sy0*srcStep+sx0] : 0;
}
__kernel void warpAffineLinear_C1_D5(__global float * src, __global float * dst, int src_cols, int src_rows,
int dst_cols, int dst_rows, int srcStep, int dstStep,
int src_offset, int dst_offset, __constant F * M )
{
int dx = get_global_id(0);
int dy = get_global_id(1);
int round_delta = AB_SCALE/INTER_TAB_SIZE/2;
src_offset = (src_offset>>2);
int X0 = rint(M[0] * dx * AB_SCALE);
int Y0 = rint(M[3] * dx * AB_SCALE);
X0 += rint((M[1]*dy + M[2]) * AB_SCALE) + round_delta;
Y0 += rint((M[4]*dy + M[5]) * AB_SCALE) + round_delta;
X0 = X0 >> (AB_BITS - INTER_BITS);
Y0 = Y0 >> (AB_BITS - INTER_BITS);
short sx0 = (short)(X0 >> INTER_BITS);
short sy0 = (short)(Y0 >> INTER_BITS);
short ax0 = (short)(X0 & (INTER_TAB_SIZE-1));
short ay0 = (short)(Y0 & (INTER_TAB_SIZE-1));
float v0, v1, v2, v3;
v0 = (sx0 >= 0 && sx0 < src_cols && sy0 >= 0 && sy0 < src_rows) ? src[src_offset+sy0 * srcStep + sx0] : 0;
v1 = (sx0+1 >= 0 && sx0+1 < src_cols && sy0 >= 0 && sy0 < src_rows) ? src[src_offset+sy0 * srcStep + sx0+1] : 0;
v2 = (sx0 >= 0 && sx0 < src_cols && sy0+1 >= 0 && sy0+1 < src_rows) ? src[src_offset+(sy0+1) * srcStep + sx0] : 0;
v3 = (sx0+1 >= 0 && sx0+1 < src_cols && sy0+1 >= 0 && sy0+1 < src_rows) ? src[src_offset+(sy0+1) * srcStep + sx0+1] : 0;
float tab[4];
float taby[2], tabx[2];
taby[0] = 1.f - 1.f/INTER_TAB_SIZE*ay0;
taby[1] = 1.f/INTER_TAB_SIZE*ay0;
tabx[0] = 1.f - 1.f/INTER_TAB_SIZE*ax0;
tabx[1] = 1.f/INTER_TAB_SIZE*ax0;
tab[0] = taby[0] * tabx[0];
tab[1] = taby[0] * tabx[1];
tab[2] = taby[1] * tabx[0];
tab[3] = taby[1] * tabx[1];
float sum = 0;
sum += v0 * tab[0] + v1 * tab[1] + v2 * tab[2] + v3 * tab[3];
if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
dst[(dst_offset>>2)+dy*dstStep+dx] = sum;
}
__kernel void warpAffineCubic_C1_D5(__global float * src, __global float * dst, int src_cols, int src_rows,
int dst_cols, int dst_rows, int srcStep, int dstStep,
int src_offset, int dst_offset, __constant F * M )
{
int dx = get_global_id(0);
int dy = get_global_id(1);
int round_delta = AB_SCALE/INTER_TAB_SIZE/2;
src_offset = (src_offset>>2);
dst_offset = (dst_offset>>2);
int X0 = rint(M[0] * dx * AB_SCALE);
int Y0 = rint(M[3] * dx * AB_SCALE);
X0 += rint((M[1]*dy + M[2]) * AB_SCALE) + round_delta;
Y0 += rint((M[4]*dy + M[5]) * AB_SCALE) + round_delta;
X0 = X0 >> (AB_BITS - INTER_BITS);
Y0 = Y0 >> (AB_BITS - INTER_BITS);
short sx = (short)(X0 >> INTER_BITS) - 1;
short sy = (short)(Y0 >> INTER_BITS) - 1;
short ay = (short)(Y0 & (INTER_TAB_SIZE-1));
short ax = (short)(X0 & (INTER_TAB_SIZE-1));
float v[16];
int i;
for(i=0; i<16; i++)
v[i] = (sx+(i&3) >= 0 && sx+(i&3) < src_cols && sy+(i>>2) >= 0 && sy+(i>>2) < src_rows) ? src[src_offset+(sy+(i>>2)) * srcStep + (sx+(i&3))] : 0;
float tab[16];
float tab1y[4], tab1x[4];
float axx, ayy;
ayy = 1.f/INTER_TAB_SIZE * ay;
axx = 1.f/INTER_TAB_SIZE * ax;
interpolateCubic(ayy, tab1y);
interpolateCubic(axx, tab1x);
#pragma unroll 4
for( i=0; i<16; i++ )
{
tab[i] = tab1y[(i>>2)] * tab1x[(i&3)];
}
if( dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
{
float sum = 0;
#pragma unroll 4
for ( i =0; i<16; i++ )
{
sum += v[i] * tab[i];
}
dst[dst_offset+dy*dstStep+dx] = sum;
}
}
/**********************************************32FC4********************************************
***********************************************************************************************/
__kernel void warpAffineNN_C4_D5(__global float4 * src, __global float4 * dst, int src_cols, int src_rows,
int dst_cols, int dst_rows, int srcStep, int dstStep,
int src_offset, int dst_offset, __constant F * M )
{
int dx = get_global_id(0);
int dy = get_global_id(1);
int round_delta = AB_SCALE/2;
int X0 = rint(M[0] * dx * AB_SCALE);
int Y0 = rint(M[3] * dx * AB_SCALE);
X0 += rint((M[1]*dy + M[2]) * AB_SCALE) + round_delta;
Y0 += rint((M[4]*dy + M[5]) * AB_SCALE) + round_delta;
short sx0 = (short)(X0 >> AB_BITS);
short sy0 = (short)(Y0 >> AB_BITS);
if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
dst[(dst_offset>>4)+dy*(dstStep>>2)+dx]= (sx0>=0 && sx0<src_cols && sy0>=0 && sy0<src_rows) ? src[(src_offset>>4)+sy0*(srcStep>>2)+sx0] : 0;
}
__kernel void warpAffineLinear_C4_D5(__global float4 * src, __global float4 * dst, int src_cols, int src_rows,
int dst_cols, int dst_rows, int srcStep, int dstStep,
int src_offset, int dst_offset, __constant F * M )
{
int dx = get_global_id(0);
int dy = get_global_id(1);
int round_delta = AB_SCALE/INTER_TAB_SIZE/2;
src_offset = (src_offset>>4);
dst_offset = (dst_offset>>4);
srcStep = (srcStep>>2);
dstStep = (dstStep>>2);
int X0 = rint(M[0] * dx * AB_SCALE);
int Y0 = rint(M[3] * dx * AB_SCALE);
X0 += rint((M[1]*dy + M[2]) * AB_SCALE) + round_delta;
Y0 += rint((M[4]*dy + M[5]) * AB_SCALE) + round_delta;
X0 = X0 >> (AB_BITS - INTER_BITS);
Y0 = Y0 >> (AB_BITS - INTER_BITS);
short sx0 = (short)(X0 >> INTER_BITS);
short sy0 = (short)(Y0 >> INTER_BITS);
short ax0 = (short)(X0 & (INTER_TAB_SIZE-1));
short ay0 = (short)(Y0 & (INTER_TAB_SIZE-1));
float4 v0, v1, v2, v3;
v0 = (sx0 >= 0 && sx0 < src_cols && sy0 >= 0 && sy0 < src_rows) ? src[src_offset+sy0 * srcStep + sx0] : 0;
v1 = (sx0+1 >= 0 && sx0+1 < src_cols && sy0 >= 0 && sy0 < src_rows) ? src[src_offset+sy0 * srcStep + sx0+1] : 0;
v2 = (sx0 >= 0 && sx0 < src_cols && sy0+1 >= 0 && sy0+1 < src_rows) ? src[src_offset+(sy0+1) * srcStep + sx0] : 0;
v3 = (sx0+1 >= 0 && sx0+1 < src_cols && sy0+1 >= 0 && sy0+1 < src_rows) ? src[src_offset+(sy0+1) * srcStep + sx0+1] : 0;
float tab[4];
float taby[2], tabx[2];
taby[0] = 1.f - 1.f/INTER_TAB_SIZE*ay0;
taby[1] = 1.f/INTER_TAB_SIZE*ay0;
tabx[0] = 1.f - 1.f/INTER_TAB_SIZE*ax0;
tabx[1] = 1.f/INTER_TAB_SIZE*ax0;
tab[0] = taby[0] * tabx[0];
tab[1] = taby[0] * tabx[1];
tab[2] = taby[1] * tabx[0];
tab[3] = taby[1] * tabx[1];
float4 sum = 0;
sum += v0 * tab[0] + v1 * tab[1] + v2 * tab[2] + v3 * tab[3];
if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
dst[dst_offset+dy*dstStep+dx] = sum;
}
__kernel void warpAffineCubic_C4_D5(__global float4 * src, __global float4 * dst, int src_cols, int src_rows,
int dst_cols, int dst_rows, int srcStep, int dstStep,
int src_offset, int dst_offset, __constant F * M )
{
int dx = get_global_id(0);
int dy = get_global_id(1);
int round_delta = AB_SCALE/INTER_TAB_SIZE/2;
src_offset = (src_offset>>4);
dst_offset = (dst_offset>>4);
srcStep = (srcStep>>2);
dstStep = (dstStep>>2);
int X0 = rint(M[0] * dx * AB_SCALE);
int Y0 = rint(M[3] * dx * AB_SCALE);
X0 += rint((M[1]*dy + M[2]) * AB_SCALE) + round_delta;
Y0 += rint((M[4]*dy + M[5]) * AB_SCALE) + round_delta;
X0 = X0 >> (AB_BITS - INTER_BITS);
Y0 = Y0 >> (AB_BITS - INTER_BITS);
short sx = (short)(X0 >> INTER_BITS) - 1;
short sy = (short)(Y0 >> INTER_BITS) - 1;
short ay = (short)(Y0 & (INTER_TAB_SIZE-1));
short ax = (short)(X0 & (INTER_TAB_SIZE-1));
float4 v[16];
int i;
for(i=0; i<16; i++)
v[i] = (sx+(i&3) >= 0 && sx+(i&3) < src_cols && sy+(i>>2) >= 0 && sy+(i>>2) < src_rows) ? src[src_offset+(sy+(i>>2)) * srcStep + (sx+(i&3))] : 0;
float tab[16];
float tab1y[4], tab1x[4];
float axx, ayy;
ayy = 1.f/INTER_TAB_SIZE * ay;
axx = 1.f/INTER_TAB_SIZE * ax;
interpolateCubic(ayy, tab1y);
interpolateCubic(axx, tab1x);
#pragma unroll 4
for( i=0; i<16; i++ )
{
tab[i] = tab1y[(i>>2)] * tab1x[(i&3)];
}
if( dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
{
float4 sum = 0;
#pragma unroll 4
for ( i =0; i<16; i++ )
{
sum += v[i] * tab[i];
}
dst[dst_offset+dy*dstStep+dx] = sum;
}
}
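For reference, a minimal host-side launch sketch for one of the kernels above. This is not part of the commit: the helper name and all buffers/sizes are hypothetical placeholders supplied by the caller, and M is the 2x3 matrix mapping destination to source coordinates, as the kernels compute X0/Y0 from dx/dy.

#include <CL/cl.h>
/* hypothetical helper: launches warpAffineCubic_C1_D0 from an already-built
cl_program; d_src, d_dst, d_M must be valid cl_mem buffers */
static void launch_warp_affine_cubic_8u(cl_command_queue queue, cl_program program,
cl_mem d_src, cl_mem d_dst, cl_mem d_M,
int src_cols, int src_rows, int dst_cols, int dst_rows,
int srcStep, int dstStep, int src_offset, int dst_offset)
{
cl_int err = CL_SUCCESS;
cl_kernel k = clCreateKernel(program, "warpAffineCubic_C1_D0", &err);
clSetKernelArg(k, 0, sizeof(cl_mem), &d_src);
clSetKernelArg(k, 1, sizeof(cl_mem), &d_dst);
clSetKernelArg(k, 2, sizeof(int), &src_cols);
clSetKernelArg(k, 3, sizeof(int), &src_rows);
clSetKernelArg(k, 4, sizeof(int), &dst_cols);
clSetKernelArg(k, 5, sizeof(int), &dst_rows);
clSetKernelArg(k, 6, sizeof(int), &srcStep);
clSetKernelArg(k, 7, sizeof(int), &dstStep);
clSetKernelArg(k, 8, sizeof(int), &src_offset);
clSetKernelArg(k, 9, sizeof(int), &dst_offset);
clSetKernelArg(k, 10, sizeof(cl_mem), &d_M); /* __constant F* M */
/* one work-item per destination pixel; the kernels bounds-check dx/dy,
so the global size may safely be rounded up to a work-group multiple */
size_t gsize[2] = { (size_t)dst_cols, (size_t)dst_rows };
err = clEnqueueNDRangeKernel(queue, k, 2, NULL, gsize, NULL, 0, NULL, NULL);
clReleaseKernel(k);
}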


@ -0,0 +1,648 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Zhang Ying, zhangying913@gmail.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other GpuMaterials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
//warpPerspective kernel
//supported data types: CV_8UC1, CV_8UC4, CV_32FC1 and CV_32FC4; supported interpolation methods: nearest neighbor, linear and cubic.
#if defined DOUBLE_SUPPORT
#pragma OPENCL EXTENSION cl_khr_fp64:enable
typedef double F;
typedef double4 F4;
#define convert_F4 convert_double4
#else
typedef float F;
typedef float4 F4;
#define convert_F4 convert_float4
#endif
#define INTER_BITS 5
#define INTER_TAB_SIZE (1 << INTER_BITS)
#define INTER_SCALE 1.f/INTER_TAB_SIZE
#define AB_BITS max(10, (int)INTER_BITS)
#define AB_SCALE (1 << AB_BITS)
#define INTER_REMAP_COEF_BITS 15
#define INTER_REMAP_COEF_SCALE (1 << INTER_REMAP_COEF_BITS)
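// Fixed-point layout (worked example): INTER_BITS = 5, so INTER_TAB_SIZE = 32
// and a mapped coordinate X carries 5 fractional bits: X >> INTER_BITS is the
// integer pixel and X & (INTER_TAB_SIZE-1) the fractional tab index.
// Interpolation weights are quantized to INTER_REMAP_COEF_SCALE = 2^15 = 32768;
// e.g. a bilinear weight of 0.25 becomes rint(0.25f * 32768) = 8192, and the
// final >> INTER_REMAP_COEF_BITS with the +(1<<14) bias rounds the weighted
// sum back to pixel range.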
inline void interpolateCubic( float x, float* coeffs )
{
const float A = -0.75f;
coeffs[0] = ((A*(x + 1.f) - 5.0f*A)*(x + 1.f) + 8.0f*A)*(x + 1.f) - 4.0f*A;
coeffs[1] = ((A + 2.f)*x - (A + 3.f))*x*x + 1.f;
coeffs[2] = ((A + 2.f)*(1.f - x) - (A + 3.f))*(1.f - x)*(1.f - x) + 1.f;
coeffs[3] = 1.f - coeffs[0] - coeffs[1] - coeffs[2];
}
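// Worked example: interpolateCubic fills the four Catmull-Rom (A = -0.75)
// weights for a fractional position x in [0,1). At x = 0 it yields
// (0, 1, 0, 0), i.e. the center pixel is reproduced exactly, and by the
// construction of coeffs[3] the four weights always sum to 1.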
/**********************************************8UC1*********************************************
***********************************************************************************************/
__kernel void warpPerspectiveNN_C1_D0(__global uchar const * restrict src, __global uchar * dst, int src_cols, int src_rows,
int dst_cols, int dst_rows, int srcStep, int dstStep,
int src_offset, int dst_offset, __constant F * M )
{
int dx = get_global_id(0);
int dy = get_global_id(1);
dx = (dx<<2) - (dst_offset&3);
F4 DX = (F4)(dx, dx+1, dx+2, dx+3);
F4 X0 = M[0]*DX + M[1]*dy + M[2];
F4 Y0 = M[3]*DX + M[4]*dy + M[5];
F4 W = M[6]*DX + M[7]*dy + M[8];
W = (W!=0) ? 1./W : 0;
short4 X = convert_short4(rint(X0*W));
short4 Y = convert_short4(rint(Y0*W));
int4 sx = convert_int4(X);
int4 sy = convert_int4(Y);
int4 DXD = (int4)(dx, dx+1, dx+2, dx+3);
__global uchar4 * d = (__global uchar4 *)(dst+dst_offset+dy*dstStep+dx);
uchar4 dval = *d;
int4 dcon = DXD >= 0 && DXD < dst_cols && dy >= 0 && dy < dst_rows;
int4 scon = sx >= 0 && sx < src_cols && sy >= 0 && sy < src_rows;
int4 spos = src_offset + sy * srcStep + sx;
uchar4 sval;
sval.s0 = scon.s0 ? src[spos.s0] : 0;
sval.s1 = scon.s1 ? src[spos.s1] : 0;
sval.s2 = scon.s2 ? src[spos.s2] : 0;
sval.s3 = scon.s3 ? src[spos.s3] : 0;
dval = convert_uchar4(dcon != 0) ? sval : dval;
*d = dval;
}
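// Note on the projective divide used by the linear/cubic kernels below:
// instead of dividing by w and then quantizing, W is set to INTER_TAB_SIZE/w,
// so X = rint(X0*W) already holds the fixed-point source coordinate: the
// pixel index in the high bits and the 5-bit interpolation tab index in the
// low bits.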
__kernel void warpPerspectiveLinear_C1_D0(__global const uchar * restrict src, __global uchar * dst,
int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep,
int dstStep, int src_offset, int dst_offset, __constant F * M )
{
int dx = get_global_id(0);
int dy = get_global_id(1);
F X0 = M[0]*dx + M[1]*dy + M[2];
F Y0 = M[3]*dx + M[4]*dy + M[5];
F W = M[6]*dx + M[7]*dy + M[8];
W = (W != 0.0) ? INTER_TAB_SIZE/W : 0.0;
int X = rint(X0*W);
int Y = rint(Y0*W);
int sx = (short)(X >> INTER_BITS);
int sy = (short)(Y >> INTER_BITS);
int ay = (short)(Y & (INTER_TAB_SIZE-1));
int ax = (short)(X & (INTER_TAB_SIZE-1));
uchar v[4];
int i;
#pragma unroll 4
for(i=0; i<4; i++)
v[i] = (sx+(i&1) >= 0 && sx+(i&1) < src_cols && sy+(i>>1) >= 0 && sy+(i>>1) < src_rows) ? src[src_offset + (sy+(i>>1)) * srcStep + (sx+(i&1))] : 0;
short itab[4];
float tab1y[2], tab1x[2];
tab1y[0] = 1.f - 1.f/INTER_TAB_SIZE*ay;
tab1y[1] = 1.f/INTER_TAB_SIZE*ay;
tab1x[0] = 1.f - 1.f/INTER_TAB_SIZE*ax;
tab1x[1] = 1.f/INTER_TAB_SIZE*ax;
#pragma unroll 4
for(i=0; i<4; i++)
{
float v = tab1y[(i>>1)] * tab1x[(i&1)];
itab[i] = convert_short_sat(rint( v * INTER_REMAP_COEF_SCALE ));
}
if(dx >=0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
{
int sum = 0;
for ( i =0; i<4; i++ )
{
sum += v[i] * itab[i] ;
}
dst[dst_offset+dy*dstStep+dx] = convert_uchar_sat ( (sum + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS ) ;
}
}
__kernel void warpPerspectiveCubic_C1_D0(__global uchar * src, __global uchar * dst, int src_cols, int src_rows,
int dst_cols, int dst_rows, int srcStep, int dstStep,
int src_offset, int dst_offset, __constant F * M )
{
int dx = get_global_id(0);
int dy = get_global_id(1);
F X0 = M[0]*dx + M[1]*dy + M[2];
F Y0 = M[3]*dx + M[4]*dy + M[5];
F W = M[6]*dx + M[7]*dy + M[8];
W = (W != 0.0) ? INTER_TAB_SIZE/W : 0.0;
int X = rint(X0*W);
int Y = rint(Y0*W);
short sx = (short)(X >> INTER_BITS) - 1;
short sy = (short)(Y >> INTER_BITS) - 1;
short ay = (short)(Y & (INTER_TAB_SIZE-1));
short ax = (short)(X & (INTER_TAB_SIZE-1));
uchar v[16];
int i, j;
#pragma unroll 4
for(i=0; i<4; i++)
for(j=0; j<4; j++)
{
v[i*4+j] = (sx+j >= 0 && sx+j < src_cols && sy+i >= 0 && sy+i < src_rows) ? src[src_offset+(sy+i) * srcStep + (sx+j)] : 0;
}
short itab[16];
float tab1y[4], tab1x[4];
float axx, ayy;
ayy = 1.f/INTER_TAB_SIZE * ay;
axx = 1.f/INTER_TAB_SIZE * ax;
interpolateCubic(ayy, tab1y);
interpolateCubic(axx, tab1x);
int isum = 0;
#pragma unroll 16
for( i=0; i<16; i++ )
{
F v = tab1y[(i>>2)] * tab1x[(i&3)];
isum += itab[i] = convert_short_sat( rint( v * INTER_REMAP_COEF_SCALE ) );
}
if( isum != INTER_REMAP_COEF_SCALE )
{
// same exact-sum fixup as in the affine cubic kernels: adjust the largest
// or smallest weight at tab indices (2..3, 2..3) so that the 16 integer
// weights sum exactly to INTER_REMAP_COEF_SCALE
int k1, k2;
int diff = isum - INTER_REMAP_COEF_SCALE;
int Mk1=2, Mk2=2, mk1=2, mk2=2;
for( k1 = 2; k1 < 4; k1++ )
for( k2 = 2; k2 < 4; k2++ )
{
if( itab[(k1<<2)+k2] < itab[(mk1<<2)+mk2] )
mk1 = k1, mk2 = k2;
else if( itab[(k1<<2)+k2] > itab[(Mk1<<2)+Mk2] )
Mk1 = k1, Mk2 = k2;
}
if( diff < 0 )
itab[(Mk1<<2)+Mk2] = (short)(itab[(Mk1<<2)+Mk2] - diff);
else
itab[(mk1<<2)+mk2] = (short)(itab[(mk1<<2)+mk2] - diff);
}
if( dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
{
int sum=0;
for ( i =0; i<16; i++ )
{
sum += v[i] * itab[i] ;
}
dst[dst_offset+dy*dstStep+dx] = convert_uchar_sat( (sum + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS ) ;
}
}
/**********************************************8UC4*********************************************
***********************************************************************************************/
__kernel void warpPerspectiveNN_C4_D0(__global uchar4 const * restrict src, __global uchar4 * dst,
int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep,
int dstStep, int src_offset, int dst_offset, __constant F * M )
{
int dx = get_global_id(0);
int dy = get_global_id(1);
F X0 = M[0]*dx + M[1]*dy + M[2];
F Y0 = M[3]*dx + M[4]*dy + M[5];
F W = M[6]*dx + M[7]*dy + M[8];
W = (W != 0.0) ? 1./W : 0.0;
int X = rint(X0*W);
int Y = rint(Y0*W);
short sx = (short)X;
short sy = (short)Y;
if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
dst[(dst_offset>>2)+dy*(dstStep>>2)+dx]= (sx>=0 && sx<src_cols && sy>=0 && sy<src_rows) ? src[(src_offset>>2)+sy*(srcStep>>2)+sx] : (uchar4)0;
}
__kernel void warpPerspectiveLinear_C4_D0(__global uchar4 const * restrict src, __global uchar4 * dst,
int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep,
int dstStep, int src_offset, int dst_offset, __constant F * M )
{
int dx = get_global_id(0);
int dy = get_global_id(1);
src_offset = (src_offset>>2);
srcStep = (srcStep>>2);
F X0 = M[0]*dx + M[1]*dy + M[2];
F Y0 = M[3]*dx + M[4]*dy + M[5];
F W = M[6]*dx + M[7]*dy + M[8];
W = (W != 0.0) ? INTER_TAB_SIZE/W : 0.0;
int X = rint(X0*W);
int Y = rint(Y0*W);
short sx = (short)(X >> INTER_BITS);
short sy = (short)(Y >> INTER_BITS);
short ay = (short)(Y & (INTER_TAB_SIZE-1));
short ax = (short)(X & (INTER_TAB_SIZE-1));
int4 v0, v1, v2, v3;
v0 = (sx >= 0 && sx < src_cols && sy >= 0 && sy < src_rows) ? convert_int4(src[src_offset+sy * srcStep + sx]) : 0;
v1 = (sx+1 >= 0 && sx+1 < src_cols && sy >= 0 && sy < src_rows) ? convert_int4(src[src_offset+sy * srcStep + sx+1]) : 0;
v2 = (sx >= 0 && sx < src_cols && sy+1 >= 0 && sy+1 < src_rows) ? convert_int4(src[src_offset+(sy+1) * srcStep + sx]) : 0;
v3 = (sx+1 >= 0 && sx+1 < src_cols && sy+1 >= 0 && sy+1 < src_rows) ? convert_int4(src[src_offset+(sy+1) * srcStep + sx+1]) : 0;
int itab0, itab1, itab2, itab3;
float taby, tabx;
taby = 1.f/INTER_TAB_SIZE*ay;
tabx = 1.f/INTER_TAB_SIZE*ax;
itab0 = convert_short_sat(rint( (1.0f-taby)*(1.0f-tabx) * INTER_REMAP_COEF_SCALE ));
itab1 = convert_short_sat(rint( (1.0f-taby)*tabx * INTER_REMAP_COEF_SCALE ));
itab2 = convert_short_sat(rint( taby*(1.0f-tabx) * INTER_REMAP_COEF_SCALE ));
itab3 = convert_short_sat(rint( taby*tabx * INTER_REMAP_COEF_SCALE ));
int4 val;
val = v0 * itab0 + v1 * itab1 + v2 * itab2 + v3 * itab3;
if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
dst[(dst_offset>>2)+dy*(dstStep>>2)+dx] = convert_uchar4_sat ( (val + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS ) ;
}
__kernel void warpPerspectiveCubic_C4_D0(__global uchar4 const * restrict src, __global uchar4 * dst,
int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep,
int dstStep, int src_offset, int dst_offset, __constant F * M )
{
int dx = get_global_id(0);
int dy = get_global_id(1);
src_offset = (src_offset>>2);
srcStep = (srcStep>>2);
dst_offset = (dst_offset>>2);
dstStep = (dstStep>>2);
F X0 = M[0]*dx + M[1]*dy + M[2];
F Y0 = M[3]*dx + M[4]*dy + M[5];
F W = M[6]*dx + M[7]*dy + M[8];
W = (W != 0.0) ? INTER_TAB_SIZE/W : 0.0;
int X = rint(X0*W);
int Y = rint(Y0*W);
short sx = (short)(X >> INTER_BITS) - 1;
short sy = (short)(Y >> INTER_BITS) - 1;
short ay = (short)(Y & (INTER_TAB_SIZE-1));
short ax = (short)(X & (INTER_TAB_SIZE-1));
uchar4 v[16];
int i,j;
#pragma unroll 4
for(i=0; i<4; i++)
for(j=0; j<4; j++)
{
v[i*4+j] = (sx+j >= 0 && sx+j < src_cols && sy+i >= 0 && sy+i < src_rows) ? (src[src_offset+(sy+i) * srcStep + (sx+j)]) : (uchar4)0;
}
int itab[16];
float tab1y[4], tab1x[4];
float axx, ayy;
ayy = INTER_SCALE * ay;
axx = INTER_SCALE * ax;
interpolateCubic(ayy, tab1y);
interpolateCubic(axx, tab1x);
int isum = 0;
#pragma unroll 16
for( i=0; i<16; i++ )
{
float tmp;
tmp = tab1y[(i>>2)] * tab1x[(i&3)] * INTER_REMAP_COEF_SCALE;
itab[i] = rint(tmp);
isum += itab[i];
}
if( isum != INTER_REMAP_COEF_SCALE )
{
int k1, k2;
int diff = isum - INTER_REMAP_COEF_SCALE;
int Mk1=2, Mk2=2, mk1=2, mk2=2;
for( k1 = 2; k1 < 4; k1++ )
for( k2 = 2; k2 < 4; k2++ )
{
if( itab[(k1<<2)+k2] < itab[(mk1<<2)+mk2] )
mk1 = k1, mk2 = k2;
else if( itab[(k1<<2)+k2] > itab[(Mk1<<2)+Mk2] )
Mk1 = k1, Mk2 = k2;
}
diff<0 ? (itab[(Mk1<<2)+Mk2]=(short)(itab[(Mk1<<2)+Mk2]-diff)) : (itab[(mk1<<2)+mk2]=(short)(itab[(mk1<<2)+mk2]-diff));
}
if( dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
{
int4 sum=0;
for ( i =0; i<16; i++ )
{
sum += convert_int4(v[i]) * itab[i];
}
dst[dst_offset+dy*dstStep+dx] = convert_uchar4_sat( (sum + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS ) ;
}
}
/**********************************************32FC1********************************************
***********************************************************************************************/
__kernel void warpPerspectiveNN_C1_D5(__global float * src, __global float * dst, int src_cols, int src_rows,
int dst_cols, int dst_rows, int srcStep, int dstStep,
int src_offset, int dst_offset, __constant F * M )
{
int dx = get_global_id(0);
int dy = get_global_id(1);
F X0 = M[0]*dx + M[1]*dy + M[2];
F Y0 = M[3]*dx + M[4]*dy + M[5];
F W = M[6]*dx + M[7]*dy + M[8];
W = (W != 0.0) ? 1./W : 0.0;
int X = rint(X0*W);
int Y = rint(Y0*W);
short sx = (short)X;
short sy = (short)Y;
if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
dst[(dst_offset>>2)+dy*dstStep+dx]= (sx>=0 && sx<src_cols && sy>=0 && sy<src_rows) ? src[(src_offset>>2)+sy*srcStep+sx] : 0;
}
__kernel void warpPerspectiveLinear_C1_D5(__global float * src, __global float * dst, int src_cols, int src_rows,
int dst_cols, int dst_rows, int srcStep, int dstStep,
int src_offset, int dst_offset, __constant F * M )
{
int dx = get_global_id(0);
int dy = get_global_id(1);
src_offset = (src_offset>>2);
F X0 = M[0]*dx + M[1]*dy + M[2];
F Y0 = M[3]*dx + M[4]*dy + M[5];
F W = M[6]*dx + M[7]*dy + M[8];
W = (W != 0.0) ? INTER_TAB_SIZE/W : 0.0;
int X = rint(X0*W);
int Y = rint(Y0*W);
short sx = (short)(X >> INTER_BITS);
short sy = (short)(Y >> INTER_BITS);
short ay = (short)(Y & (INTER_TAB_SIZE-1));
short ax = (short)(X & (INTER_TAB_SIZE-1));
float v0, v1, v2, v3;
v0 = (sx >= 0 && sx < src_cols && sy >= 0 && sy < src_rows) ? src[src_offset+sy * srcStep + sx] : 0;
v1 = (sx+1 >= 0 && sx+1 < src_cols && sy >= 0 && sy < src_rows) ? src[src_offset+sy * srcStep + sx+1] : 0;
v2 = (sx >= 0 && sx < src_cols && sy+1 >= 0 && sy+1 < src_rows) ? src[src_offset+(sy+1) * srcStep + sx] : 0;
v3 = (sx+1 >= 0 && sx+1 < src_cols && sy+1 >= 0 && sy+1 < src_rows) ? src[src_offset+(sy+1) * srcStep + sx+1] : 0;
float tab[4];
float taby[2], tabx[2];
taby[0] = 1.f - 1.f/INTER_TAB_SIZE*ay;
taby[1] = 1.f/INTER_TAB_SIZE*ay;
tabx[0] = 1.f - 1.f/INTER_TAB_SIZE*ax;
tabx[1] = 1.f/INTER_TAB_SIZE*ax;
tab[0] = taby[0] * tabx[0];
tab[1] = taby[0] * tabx[1];
tab[2] = taby[1] * tabx[0];
tab[3] = taby[1] * tabx[1];
float sum = 0;
sum += v0 * tab[0] + v1 * tab[1] + v2 * tab[2] + v3 * tab[3];
if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
dst[(dst_offset>>2)+dy*dstStep+dx] = sum;
}
__kernel void warpPerspectiveCubic_C1_D5(__global float * src, __global float * dst, int src_cols, int src_rows,
int dst_cols, int dst_rows, int srcStep, int dstStep,
int src_offset, int dst_offset, __constant F * M )
{
int dx = get_global_id(0);
int dy = get_global_id(1);
src_offset = (src_offset>>2);
dst_offset = (dst_offset>>2);
F X0 = M[0]*dx + M[1]*dy + M[2];
F Y0 = M[3]*dx + M[4]*dy + M[5];
F W = M[6]*dx + M[7]*dy + M[8];
W = (W != 0.0) ? INTER_TAB_SIZE/W : 0.0;
int X = rint(X0*W);
int Y = rint(Y0*W);
short sx = (short)(X >> INTER_BITS) - 1;
short sy = (short)(Y >> INTER_BITS) - 1;
short ay = (short)(Y & (INTER_TAB_SIZE-1));
short ax = (short)(X & (INTER_TAB_SIZE-1));
float v[16];
int i;
for(i=0; i<16; i++)
v[i] = (sx+(i&3) >= 0 && sx+(i&3) < src_cols && sy+(i>>2) >= 0 && sy+(i>>2) < src_rows) ? src[src_offset+(sy+(i>>2)) * srcStep + (sx+(i&3))] : 0;
float tab[16];
float tab1y[4], tab1x[4];
float axx, ayy;
ayy = 1.f/INTER_TAB_SIZE * ay;
axx = 1.f/INTER_TAB_SIZE * ax;
interpolateCubic(ayy, tab1y);
interpolateCubic(axx, tab1x);
#pragma unroll 4
for( i=0; i<16; i++ )
{
tab[i] = tab1y[(i>>2)] * tab1x[(i&3)];
}
if( dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
{
float sum = 0;
#pragma unroll 4
for ( i =0; i<16; i++ )
{
sum += v[i] * tab[i];
}
dst[dst_offset+dy*dstStep+dx] = sum;
}
}
/**********************************************32FC4********************************************
***********************************************************************************************/
__kernel void warpPerspectiveNN_C4_D5(__global float4 * src, __global float4 * dst, int src_cols, int src_rows,
int dst_cols, int dst_rows, int srcStep, int dstStep,
int src_offset, int dst_offset, __constant F * M )
{
int dx = get_global_id(0);
int dy = get_global_id(1);
F X0 = M[0]*dx + M[1]*dy + M[2];
F Y0 = M[3]*dx + M[4]*dy + M[5];
F W = M[6]*dx + M[7]*dy + M[8];
W =(W != 0.0)? 1./W : 0.0;
int X = rint(X0*W);
int Y = rint(Y0*W);
short sx = (short)X;
short sy = (short)Y;
if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
dst[(dst_offset>>4)+dy*(dstStep>>2)+dx]= (sx>=0 && sx<src_cols && sy>=0 && sy<src_rows) ? src[(src_offset>>4)+sy*(srcStep>>2)+sx] : 0;
}
__kernel void warpPerspectiveLinear_C4_D5(__global float4 * src, __global float4 * dst, int src_cols, int src_rows,
int dst_cols, int dst_rows, int srcStep, int dstStep,
int src_offset, int dst_offset, __constant F * M )
{
int dx = get_global_id(0);
int dy = get_global_id(1);
src_offset = (src_offset>>4);
dst_offset = (dst_offset>>4);
srcStep = (srcStep>>2);
dstStep = (dstStep>>2);
F X0 = M[0]*dx + M[1]*dy + M[2];
F Y0 = M[3]*dx + M[4]*dy + M[5];
F W = M[6]*dx + M[7]*dy + M[8];
W = (W != 0.0) ? INTER_TAB_SIZE/W : 0.0;
int X = rint(X0*W);
int Y = rint(Y0*W);
short sx0 = (short)(X >> INTER_BITS);
short sy0 = (short)(Y >> INTER_BITS);
short ay0 = (short)(Y & (INTER_TAB_SIZE-1));
short ax0 = (short)(X & (INTER_TAB_SIZE-1));
float4 v0, v1, v2, v3;
v0 = (sx0 >= 0 && sx0 < src_cols && sy0 >= 0 && sy0 < src_rows) ? src[src_offset+sy0 * srcStep + sx0] : 0;
v1 = (sx0+1 >= 0 && sx0+1 < src_cols && sy0 >= 0 && sy0 < src_rows) ? src[src_offset+sy0 * srcStep + sx0+1] : 0;
v2 = (sx0 >= 0 && sx0 < src_cols && sy0+1 >= 0 && sy0+1 < src_rows) ? src[src_offset+(sy0+1) * srcStep + sx0] : 0;
v3 = (sx0+1 >= 0 && sx0+1 < src_cols && sy0+1 >= 0 && sy0+1 < src_rows) ? src[src_offset+(sy0+1) * srcStep + sx0+1] : 0;
float tab[4];
float taby[2], tabx[2];
taby[0] = 1.f - 1.f/INTER_TAB_SIZE*ay0;
taby[1] = 1.f/INTER_TAB_SIZE*ay0;
tabx[0] = 1.f - 1.f/INTER_TAB_SIZE*ax0;
tabx[1] = 1.f/INTER_TAB_SIZE*ax0;
tab[0] = taby[0] * tabx[0];
tab[1] = taby[0] * tabx[1];
tab[2] = taby[1] * tabx[0];
tab[3] = taby[1] * tabx[1];
float4 sum = 0;
sum += v0 * tab[0] + v1 * tab[1] + v2 * tab[2] + v3 * tab[3];
if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
dst[dst_offset+dy*dstStep+dx] = sum;
}
__kernel void warpPerspectiveCubic_C4_D5(__global float4 * src, __global float4 * dst,
int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep,
int dstStep, int src_offset, int dst_offset, __constant F * M )
{
int dx = get_global_id(0);
int dy = get_global_id(1);
src_offset = (src_offset>>4);
dst_offset = (dst_offset>>4);
srcStep = (srcStep>>2);
dstStep = (dstStep>>2);
F X0 = M[0]*dx + M[1]*dy + M[2];
F Y0 = M[3]*dx + M[4]*dy + M[5];
F W = M[6]*dx + M[7]*dy + M[8];
W = (W != 0.0) ? INTER_TAB_SIZE/W : 0.0;
int X = rint(X0*W);
int Y = rint(Y0*W);
short sx = (short)(X >> INTER_BITS) - 1;
short sy = (short)(Y >> INTER_BITS) - 1;
short ay = (short)(Y & (INTER_TAB_SIZE-1));
short ax = (short)(X & (INTER_TAB_SIZE-1));
float4 v[16];
int i;
for(i=0; i<16; i++)
v[i] = (sx+(i&3) >= 0 && sx+(i&3) < src_cols && sy+(i>>2) >= 0 && sy+(i>>2) < src_rows) ? src[src_offset+(sy+(i>>2)) * srcStep + (sx+(i&3))] : 0;
float tab[16];
float tab1y[4], tab1x[4];
float axx, ayy;
ayy = 1.f/INTER_TAB_SIZE * ay;
axx = 1.f/INTER_TAB_SIZE * ax;
interpolateCubic(ayy, tab1y);
interpolateCubic(axx, tab1x);
#pragma unroll 4
for( i=0; i<16; i++ )
{
tab[i] = tab1y[(i>>2)] * tab1x[(i&3)];
}
if( dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
{
float4 sum = 0;
#pragma unroll 4
for ( i =0; i<16; i++ )
{
sum += v[i] * tab[i];
}
dst[dst_offset+dy*dstStep+dx] = sum;
}
}


@ -0,0 +1,248 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Shengen Yan,yanshengen@gmail.com
// Xu Pang, pangxu010@163.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other GpuMaterials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if defined (DOUBLE_SUPPORT)
#pragma OPENCL EXTENSION cl_khr_fp64:enable
typedef double F;
#else
typedef float F;
#endif
short2 do_mean_shift(int x0, int y0, __global uchar4* out,int out_step,
__global uchar4* in, int in_step, int dst_off, int src_off,
int cols, int rows, int sp, int sr, int maxIter, float eps)
{
int isr2 = sr*sr;
in_step = in_step >> 2;
out_step = out_step >> 2;
src_off = src_off >> 2;
dst_off = dst_off >> 2;
int idx = src_off + y0 * in_step + x0;
// uchar4 c = vload4(0, (__global uchar*)in+idx);
uchar4 c = in[idx];
int base = dst_off + get_global_id(1)*out_step + get_global_id(0) ;
// iterate meanshift procedure
for( int iter = 0; iter < maxIter; iter++ )
{
int count = 0;
int4 s = (int4)0;
int sx = 0, sy = 0;
//mean shift: average the pixels in the window [x0-sp, x0+sp] x [y0-sp, y0+sp],
//clamped to the image boundary
int minx = (x0-sp)>0 ? x0-sp : 0;
int miny = (y0-sp)>0 ? y0-sp : 0;
int maxx = (x0+sp)<cols ? x0+sp : cols-1;
int maxy = (y0+sp)<rows ? y0+sp : rows-1;
for( int y = miny; y <= maxy; y++)
{
int rowCount = 0;
int x = minx;
for( ; x+3 <= maxx; x+=4 )
{
int id = src_off + y*in_step + x;
uchar16 t = (uchar16)(in[id],in[id+1],in[id+2],in[id+3]);
int norm2_1 = (t.s0 - c.x) * (t.s0 - c.x) + (t.s1 - c.y) * (t.s1 - c.y) +
(t.s2 - c.z) * (t.s2 - c.z);
int norm2_2 = (t.s4 - c.x) * (t.s4 - c.x) + (t.s5 - c.y) * (t.s5 - c.y) +
(t.s6 - c.z) * (t.s6 - c.z);
int norm2_3 = (t.s8 - c.x) * (t.s8 - c.x) + (t.s9 - c.y) * (t.s9 - c.y) +
(t.sa - c.z) * (t.sa - c.z);
int norm2_4 = (t.sc - c.x) * (t.sc - c.x) + (t.sd - c.y) * (t.sd - c.y) +
(t.se - c.z) * (t.se - c.z);
if( norm2_1 <= isr2 )
{
s.x += t.s0; s.y += t.s1; s.z += t.s2;
sx += x; rowCount++;
}
if( norm2_2 <= isr2 )
{
s.x += t.s4; s.y += t.s5; s.z += t.s6;
sx += x+1; rowCount++;
}
if( norm2_3 <= isr2 )
{
s.x += t.s8; s.y += t.s9; s.z += t.sa;
sx += x+2; rowCount++;
}
if( norm2_4 <= isr2 )
{
s.x += t.sc; s.y += t.sd; s.z += t.se;
sx += x+3; rowCount++;
}
}
if(x == maxx)
{
int id = src_off + y*in_step + x;
uchar4 t = in[id];
int norm2 = (t.s0 - c.x) * (t.s0 - c.x) + (t.s1 - c.y) * (t.s1 - c.y) +
(t.s2 - c.z) * (t.s2 - c.z);
if( norm2 <= isr2 )
{
s.x += t.s0; s.y += t.s1; s.z += t.s2;
sx += x; rowCount++;
}
}
if(x+1 == maxx)
{
int id = src_off + y*in_step + x;
uchar8 t = (uchar8)(in[id],in[id+1]);
int norm2_1 = (t.s0 - c.x) * (t.s0 - c.x) + (t.s1 - c.y) * (t.s1 - c.y) +
(t.s2 - c.z) * (t.s2 - c.z);
int norm2_2 = (t.s4 - c.x) * (t.s4 - c.x) + (t.s5 - c.y) * (t.s5 - c.y) +
(t.s6 - c.z) * (t.s6 - c.z);
if( norm2_1 <= isr2 )
{
s.x += t.s0; s.y += t.s1; s.z += t.s2;
sx += x; rowCount++;
}
if( norm2_2 <= isr2 )
{
s.x += t.s4; s.y += t.s5; s.z += t.s6;
sx += x+1; rowCount++;
}
}
if(x+2 == maxx)
{
int id = src_off + y*in_step + x;
uchar16 t = (uchar16)(in[id],in[id+1],in[id+2],in[id+3]);
int norm2_1 = (t.s0 - c.x) * (t.s0 - c.x) + (t.s1 - c.y) * (t.s1 - c.y) +
(t.s2 - c.z) * (t.s2 - c.z);
int norm2_2 = (t.s4 - c.x) * (t.s4 - c.x) + (t.s5 - c.y) * (t.s5 - c.y) +
(t.s6 - c.z) * (t.s6 - c.z);
int norm2_3 = (t.s8 - c.x) * (t.s8 - c.x) + (t.s9 - c.y) * (t.s9 - c.y) +
(t.sa - c.z) * (t.sa - c.z);
if( norm2_1 <= isr2 )
{
s.x += t.s0; s.y += t.s1; s.z += t.s2;
sx += x; rowCount++;
}
if( norm2_2 <= isr2 )
{
s.x += t.s4; s.y += t.s5; s.z += t.s6;
sx += x+1; rowCount++;
}
if( norm2_3 <= isr2 )
{
s.x += t.s8; s.y += t.s9; s.z += t.sa;
sx += x+2; rowCount++;
}
}
if(rowCount == 0)
continue;
count += rowCount;
if(y == 0)
continue;
sy += y*rowCount;
}
if( count == 0 )
break;
F icount = 1.0/count;
int x1 = convert_int_rtz(sx*icount);
int y1 = convert_int_rtz(sy*icount);
s.x = convert_int_rtz(s.x*icount);
s.y = convert_int_rtz(s.y*icount);
s.z = convert_int_rtz(s.z*icount);
int4 tmp = s - convert_int4(c);
int norm2 = tmp.x * tmp.x + tmp.y * tmp.y +
tmp.z * tmp.z;
bool stopFlag = (x1 == x0 && y1 == y0) || (abs(x1-x0) + abs(y1-y0) + norm2 <= eps);
x0 = x1;
y0 = y1;
c.x = s.x;
c.y = s.y;
c.z = s.z;
if( stopFlag )
break;
}
out[base] = c;
return (short2)((short)x0, (short)y0);
}
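// One mean-shift step, as implemented above: among the pixels p of the
// spatial window whose color distance to the current center c satisfies
// ||p - c||^2 <= sr*sr, take the plain average of positions and colors as
// the new center (sx/count, sy/count, s/count). Iteration ends when the
// window stops moving, when the combined shift |dx| + |dy| + ||dcolor||^2
// drops to eps or below, or after maxIter steps.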
__kernel void meanshift_kernel(__global uchar4* out, int out_step,
__global uchar4* in, int in_step,
int dst_off, int src_off, int cols, int rows,
int sp, int sr, int maxIter, float eps)
{
int x0 = get_global_id(0);
int y0 = get_global_id(1);
if( x0 < cols && y0 < rows )
do_mean_shift(x0, y0, out, out_step, in, in_step, dst_off, src_off,
cols, rows, sp, sr, maxIter, eps);
}
__kernel void meanshiftproc_kernel( __global uchar4* in, __global uchar4* outr,
__global short2* outsp, int instep, int outrstep,
int outspstep, int in_off, int outr_off, int outsp_off,
int cols, int rows, int sp, int sr, int maxIter, float eps )
{
int x0 = get_global_id(0);
int y0 = get_global_id(1);
if( x0 < cols && y0 < rows )
{
//int basesp = (blockIdx.y * blockDim.y + threadIdx.y) * outspstep + (blockIdx.x * blockDim.x + threadIdx.x) * 2 * sizeof(short);
//*(short2*)(outsp + basesp) = do_mean_shift(x0, y0, outr, outrstep, cols, rows, sp, sr, maxIter, eps);
// the caller has ensured that ((outspstep & 3) == 0), i.e. the step is a multiple of 4 bytes, so the >>2 below is exact.
outsp_off >>= 2;
outspstep >>= 2;
int basesp = outsp_off + y0 * outspstep + x0;
outsp[basesp] = do_mean_shift(x0, y0, outr, outrstep, in, instep, outr_off, in_off, cols, rows, sp, sr, maxIter, eps);
// outsp[basesp] =(short2)((short)x0,(short)y0);
}
}

File diff suppressed because it is too large


@ -0,0 +1,335 @@
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Niko Li, newlife20080214@gmail.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other oclMaterials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//
#define F float
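// The *_C1_D0 kernels below process four uchar pixels per work-item and
// store them with a single aligned uchar4 write: the destination index is
// rounded down to a 4-byte boundary, the source read is shifted by the same
// misalignment, and per-component range checks guard the row edges.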
__kernel void convert_to_S4_C1_D0(
__global const int* restrict srcMat,
__global uchar* dstMat,
int cols,
int rows,
int srcStep_in_pixel,
int srcoffset_in_pixel,
int dstStep_in_pixel,
int dstoffset_in_pixel,
F alpha,
F beta)
{
int x=get_global_id(0)<<2;
int y=get_global_id(1);
//int src_addr_start = mad24(y,srcStep_in_pixel,srcoffset_in_pixel);
//int src_addr_end = mad24(y,srcStep_in_pixel,cols+srcoffset_in_pixel);
// shift the source read by the destination's misalignment so that the
// uchar4 store below lands on a 4-byte boundary
int off_src = (dstoffset_in_pixel & 3);
int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel - off_src);
int dst_addr_start = mad24(y,dstStep_in_pixel,dstoffset_in_pixel);
int dst_addr_end = mad24(y,dstStep_in_pixel,cols+dstoffset_in_pixel);
// '+' binds tighter than '&', so this rounds (x + dstoffset_in_pixel) down
// to a multiple of 4
int dstidx = mad24(y,dstStep_in_pixel,(x+ dstoffset_in_pixel) & (int)0xfffffffc);
if ( (x < cols + off_src) & (y < rows) )
{
float4 temp_src = convert_float4(vload4(0,srcMat+srcidx));
uchar4 temp_dst = *(__global uchar4*)(dstMat+dstidx);
//int trans_src[10] = {temp_src1.y,temp_src1.z,temp_src1.w,temp_src.x,temp_src.y,temp_src.z,temp_src.w,temp_src2.x,temp_src2.y,temp_src2.z};
temp_dst.x = (dstidx>=dst_addr_start)&(dstidx<dst_addr_end) ? convert_uchar_sat(temp_src.x*alpha+beta) : temp_dst.x;
temp_dst.y = (dstidx+1>=dst_addr_start)&(dstidx+1<dst_addr_end) ? convert_uchar_sat(temp_src.y*alpha+beta) : temp_dst.y;
temp_dst.z = (dstidx+2>=dst_addr_start)&(dstidx+2<dst_addr_end) ? convert_uchar_sat(temp_src.z*alpha+beta) : temp_dst.z;
temp_dst.w = (dstidx+3>=dst_addr_start)&(dstidx+3<dst_addr_end) ? convert_uchar_sat(temp_src.w*alpha+beta) : temp_dst.w;
*(__global uchar4*)(dstMat+dstidx) = temp_dst;
}
}
__kernel void convert_to_S4_C4_D0(
__global const int4* restrict srcMat,
__global uchar4* dstMat,
int cols,
int rows,
int srcStep_in_pixel,
int srcoffset_in_pixel,
int dstStep_in_pixel,
int dstoffset_in_pixel,
F alpha,
F beta)
{
int x=get_global_id(0);
int y=get_global_id(1);
int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel);
int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
if ( (x < cols) & (y < rows) )
{
float4 temp_src = convert_float4(srcMat[srcidx]);
dstMat[dstidx] = convert_uchar4_sat(temp_src*alpha+beta);
}
}
__kernel void convert_to_S5_C1_D0(
__global const float* restrict srcMat,
__global uchar* dstMat,
int cols,
int rows,
int srcStep_in_pixel,
int srcoffset_in_pixel,
int dstStep_in_pixel,
int dstoffset_in_pixel,
F alpha,
F beta)
{
int x=get_global_id(0)<<2;
int y=get_global_id(1);
//int src_addr_start = mad24(y,srcStep_in_pixel,srcoffset_in_pixel);
//int src_addr_end = mad24(y,srcStep_in_pixel,cols+srcoffset_in_pixel);
int off_src = (dstoffset_in_pixel & 3);
int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel - off_src);
int dst_addr_start = mad24(y,dstStep_in_pixel,dstoffset_in_pixel);
int dst_addr_end = mad24(y,dstStep_in_pixel,cols+dstoffset_in_pixel);
int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel & (int)0xfffffffc);
if ( (x < cols + off_src) & (y < rows) )
{
float4 temp_src = vload4(0,srcMat+srcidx);
uchar4 temp_dst = *(__global uchar4*)(dstMat+dstidx);
//int trans_src[10] = {temp_src1.y,temp_src1.z,temp_src1.w,temp_src.x,temp_src.y,temp_src.z,temp_src.w,temp_src2.x,temp_src2.y,temp_src2.z};
temp_dst.x = (dstidx>=dst_addr_start)&(dstidx<dst_addr_end) ? convert_uchar_sat(temp_src.x*alpha+beta) : temp_dst.x;
temp_dst.y = (dstidx+1>=dst_addr_start)&(dstidx+1<dst_addr_end) ? convert_uchar_sat(temp_src.y*alpha+beta) : temp_dst.y;
temp_dst.z = (dstidx+2>=dst_addr_start)&(dstidx+2<dst_addr_end) ? convert_uchar_sat(temp_src.z*alpha+beta) : temp_dst.z;
temp_dst.w = (dstidx+3>=dst_addr_start)&(dstidx+3<dst_addr_end) ? convert_uchar_sat(temp_src.w*alpha+beta) : temp_dst.w;
*(__global uchar4*)(dstMat+dstidx) = temp_dst;
}
}
__kernel void convert_to_S5_C4_D0(
__global const float4* restrict srcMat,
__global uchar4* dstMat,
int cols,
int rows,
int srcStep_in_pixel,
int srcoffset_in_pixel,
int dstStep_in_pixel,
int dstoffset_in_pixel,
F alpha,
F beta)
{
int x=get_global_id(0);
int y=get_global_id(1);
int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel);
int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
if ( (x < cols) & (y < rows) )
{
float4 temp_src = srcMat[srcidx];
dstMat[dstidx] = convert_uchar4_sat(temp_src*alpha+beta);
}
}
__kernel void convert_to_S0_C1_D4(
__global const uchar* restrict srcMat,
__global int* dstMat,
int cols,
int rows,
int srcStep_in_pixel,
int srcoffset_in_pixel,
int dstStep_in_pixel,
int dstoffset_in_pixel,
F alpha,
F beta)
{
int x=get_global_id(0);
int y=get_global_id(1);
int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel);
int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
if ( (x < cols) & (y < rows) )
{
float temp_src = convert_float(srcMat[srcidx]);
dstMat[dstidx] = convert_int_sat(temp_src*alpha+beta);
}
}
__kernel void convert_to_S5_C1_D4(
__global const float* restrict srcMat,
__global int* dstMat,
int cols,
int rows,
int srcStep_in_pixel,
int srcoffset_in_pixel,
int dstStep_in_pixel,
int dstoffset_in_pixel,
F alpha,
F beta)
{
int x=get_global_id(0);
int y=get_global_id(1);
int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel);
int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
if ( (x < cols) & (y < rows) )
{
float temp_src = srcMat[srcidx];
dstMat[dstidx] = convert_int_sat(temp_src*alpha+beta);
}
}
__kernel void convert_to_S0_C4_D4(
__global const uchar4* restrict srcMat,
__global int4* dstMat,
int cols,
int rows,
int srcStep_in_pixel,
int srcoffset_in_pixel,
int dstStep_in_pixel,
int dstoffset_in_pixel,
F alpha,
F beta)
{
int x=get_global_id(0);
int y=get_global_id(1);
int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel);
int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
if ( (x < cols) & (y < rows) )
{
float4 temp_src = convert_float4(srcMat[srcidx]);
dstMat[dstidx] = convert_int4_sat(temp_src*alpha+beta);
}
}
__kernel void convert_to_S5_C4_D4(
__global const float4* restrict srcMat,
__global int4* dstMat,
int cols,
int rows,
int srcStep_in_pixel,
int srcoffset_in_pixel,
int dstStep_in_pixel,
int dstoffset_in_pixel,
F alpha,
F beta)
{
int x=get_global_id(0);
int y=get_global_id(1);
int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel);
int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
if ( (x < cols) & (y < rows) )
{
float4 temp_src = srcMat[srcidx];
dstMat[dstidx] = convert_int4_sat(temp_src*alpha+beta);
}
}
__kernel void convert_to_S0_C1_D5(
__global const uchar* restrict srcMat,
__global float* dstMat,
int cols,
int rows,
int srcStep_in_pixel,
int srcoffset_in_pixel,
int dstStep_in_pixel,
int dstoffset_in_pixel,
F alpha,
F beta)
{
int x=get_global_id(0);
int y=get_global_id(1);
int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel);
int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
if ( (x < cols) & (y < rows) )
{
float temp_src = convert_float(srcMat[srcidx]);
dstMat[dstidx] = temp_src*alpha+beta;
}
}
__kernel void convert_to_S4_C1_D5(
__global const int* restrict srcMat,
__global float* dstMat,
int cols,
int rows,
int srcStep_in_pixel,
int srcoffset_in_pixel,
int dstStep_in_pixel,
int dstoffset_in_pixel,
F alpha,
F beta)
{
int x=get_global_id(0);
int y=get_global_id(1);
int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel);
int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
if ( (x < cols) & (y < rows) )
{
float temp_src = convert_float(srcMat[srcidx]);
dstMat[dstidx] = temp_src*alpha+beta;
}
}
__kernel void convert_to_S0_C4_D5(
__global const uchar4* restrict srcMat,
__global float4* dstMat,
int cols,
int rows,
int srcStep_in_pixel,
int srcoffset_in_pixel,
int dstStep_in_pixel,
int dstoffset_in_pixel,
F alpha,
F beta)
{
int x=get_global_id(0);
int y=get_global_id(1);
int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel);
int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
if ( (x < cols) & (y < rows) )
{
float4 temp_src = convert_float4(srcMat[srcidx]);
dstMat[dstidx] = temp_src*alpha+beta;
}
}
__kernel void convert_to_S4_C4_D5(
__global const int4* restrict srcMat,
__global float4* dstMat,
int cols,
int rows,
int srcStep_in_pixel,
int srcoffset_in_pixel,
int dstStep_in_pixel,
int dstoffset_in_pixel,
F alpha,
F beta)
{
int x=get_global_id(0);
int y=get_global_id(1);
int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel);
int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
if ( (x < cols) & (y < rows) )
{
float4 temp_src = convert_float4(srcMat[srcidx]);
dstMat[dstidx] = temp_src*alpha+beta;
}
}
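Naming note for the conversion kernels above: the suffixes encode OpenCV type codes, with S<d> the source depth, C<n> the channel count and D<d> the destination depth (depth 0 = CV_8U/uchar, 4 = CV_32S/int, 5 = CV_32F/float); every kernel applies dst = saturate(src*alpha + beta). A hypothetical host-side helper (not part of this commit) could pick the kernel name like:

char name[64];
sprintf(name, "convert_to_S%d_C%d_D%d", src_depth, channels, dst_depth);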


@ -0,0 +1,209 @@
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Niko Li, newlife20080214@gmail.com
// Jia Haipeng, jiahaipeng95@gmail.com
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other GpuMaterials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//
__kernel void copy_to_with_mask_C1_D0(
__global const uchar* restrict srcMat,
__global uchar* dstMat,
__global const uchar* restrict maskMat,
int cols,
int rows,
int srcStep_in_pixel,
int srcoffset_in_pixel,
int dstStep_in_pixel,
int dstoffset_in_pixel,
int maskStep,
int maskoffset)
{
int x=get_global_id(0)<<2;
int y=get_global_id(1);
int dst_addr_start = mad24((uint)y, (uint)dstStep_in_pixel, (uint)dstoffset_in_pixel);
int dst_addr_end = mad24((uint)y, (uint)dstStep_in_pixel, (uint)cols+dstoffset_in_pixel);
int dstidx = mad24((uint)y, (uint)dstStep_in_pixel, (uint)x+ dstoffset_in_pixel) & (int)0xfffffffc;
int vector_off = dstoffset_in_pixel & 3;
int srcidx = mad24((uint)y, (uint)srcStep_in_pixel, (uint)x + srcoffset_in_pixel - vector_off);
int mask_addr_start = mad24((uint)y, (uint)maskStep, (uint)maskoffset);
int mask_addr_end = mad24((uint)y, (uint)maskStep, (uint)cols+maskoffset);
int maskidx = mad24((uint)y, (uint)maskStep, (uint)x + maskoffset - vector_off);
if ( (x < cols + dstoffset_in_pixel) & (y < rows) )
{
uchar4 src_data = vload4(0, srcMat + srcidx);
uchar4 mask_data = vload4(0, maskMat + maskidx);
uchar4 dst_data = *((__global uchar4 *)(dstMat + dstidx));
uchar4 tmp_data;
mask_data.x = ((maskidx + 0 >= mask_addr_start) && (maskidx + 0 < mask_addr_end)) ? mask_data.x : 0;
mask_data.y = ((maskidx + 1 >= mask_addr_start) && (maskidx + 1 < mask_addr_end)) ? mask_data.y : 0;
mask_data.z = ((maskidx + 2 >= mask_addr_start) && (maskidx + 2 < mask_addr_end)) ? mask_data.z : 0;
mask_data.w = ((maskidx + 3 >= mask_addr_start) && (maskidx + 3 < mask_addr_end)) ? mask_data.w : 0;
tmp_data.x = ((dstidx + 0 >= dst_addr_start) && (dstidx + 0 < dst_addr_end) && (mask_data.x))
? src_data.x : dst_data.x;
tmp_data.y = ((dstidx + 1 >= dst_addr_start) && (dstidx + 1 < dst_addr_end) && (mask_data.y))
? src_data.y : dst_data.y;
tmp_data.z = ((dstidx + 2 >= dst_addr_start) && (dstidx + 2 < dst_addr_end) && (mask_data.z))
? src_data.z : dst_data.z;
tmp_data.w = ((dstidx + 3 >= dst_addr_start) && (dstidx + 3 < dst_addr_end) && (mask_data.w))
? src_data.w : dst_data.w;
(*(__global uchar4*)(dstMat+dstidx)) = tmp_data;
}
}
__kernel void copy_to_with_mask_C4_D0(
__global const uchar4* restrict srcMat,
__global uchar4* dstMat,
__global const uchar* restrict maskMat,
int cols,
int rows,
int srcStep_in_pixel,
int srcoffset_in_pixel,
int dstStep_in_pixel,
int dstoffset_in_pixel,
int maskStep,
int maskoffset)
{
int x=get_global_id(0);
int y=get_global_id(1);
int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel);
int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
int maskidx = mad24(y,maskStep,x+ maskoffset);
uchar mask = maskMat[maskidx];
if ( (x < cols) & (y < rows) & mask)
{
dstMat[dstidx] = srcMat[srcidx];
}
}
__kernel void copy_to_with_mask_C1_D4(
__global const int* restrict srcMat,
__global int* dstMat,
__global const uchar* restrict maskMat,
int cols,
int rows,
int srcStep_in_pixel,
int srcoffset_in_pixel,
int dstStep_in_pixel,
int dstoffset_in_pixel,
int maskStep,
int maskoffset)
{
int x=get_global_id(0);
int y=get_global_id(1);
int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel);
int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
int maskidx = mad24(y,maskStep,x+ maskoffset);
uchar mask = maskMat[maskidx];
if ( (x < cols) & (y < rows) & mask)
{
dstMat[dstidx] = srcMat[srcidx];
}
}
__kernel void copy_to_with_mask_C4_D4(
__global const int4* restrict srcMat,
__global int4* dstMat,
__global const uchar* restrict maskMat,
int cols,
int rows,
int srcStep_in_pixel,
int srcoffset_in_pixel,
int dstStep_in_pixel,
int dstoffset_in_pixel,
int maskStep,
int maskoffset)
{
int x=get_global_id(0);
int y=get_global_id(1);
int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel);
int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
int maskidx = mad24(y,maskStep,x+ maskoffset);
uchar mask = maskMat[maskidx];
if ( (x < cols) & (y < rows) & mask)
{
dstMat[dstidx] = srcMat[srcidx];
}
}
__kernel void copy_to_with_mask_C1_D5(
__global const float* restrict srcMat,
__global float* dstMat,
__global const uchar* restrict maskMat,
int cols,
int rows,
int srcStep_in_pixel,
int srcoffset_in_pixel,
int dstStep_in_pixel,
int dstoffset_in_pixel,
int maskStep,
int maskoffset)
{
int x=get_global_id(0);
int y=get_global_id(1);
int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel);
int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
int maskidx = mad24(y,maskStep,x+ maskoffset);
uchar mask = maskMat[maskidx];
if ( (x < cols) & (y < rows) & mask)
{
dstMat[dstidx] = srcMat[srcidx];
}
}
__kernel void copy_to_with_mask_C4_D5(
__global const float4* restrict srcMat,
__global float4* dstMat,
__global const uchar* restrict maskMat,
int cols,
int rows,
int srcStep_in_pixel,
int srcoffset_in_pixel,
int dstStep_in_pixel,
int dstoffset_in_pixel,
int maskStep,
int maskoffset)
{
int x=get_global_id(0);
int y=get_global_id(1);
int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel);
int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
int maskidx = mad24(y,maskStep,x+ maskoffset);
uchar mask = maskMat[maskidx];
if ( (x < cols) & (y < rows) & mask)
{
dstMat[dstidx] = srcMat[srcidx];
}
}
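
The masked-copy kernels above differ only in element type; the C{channels}_D{depth} suffix mirrors OpenCV's type codes (D0 = CV_8U, D4 = CV_32S, D5 = CV_32F), and the host code later in this commit passes the channel count and depth to openCLExecuteKernel, which presumably appends the matching suffix. A minimal sketch of that mapping, with a hypothetical helper name:

#include <sstream>
#include <string>

// Hypothetical helper (not part of this commit): build the specialized
// kernel name from an oclMat's channel count and OpenCV depth code.
static std::string maskedCopyKernelName(int channels, int depth)
{
std::ostringstream name;
name << "copy_to_with_mask_C" << channels << "_D" << depth;
return name.str(); // e.g. channels = 4, depth = 5 -> "copy_to_with_mask_C4_D5"
}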

View File

@ -0,0 +1,124 @@
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Niko Li, newlife20080214@gmail.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//
/*
#if defined (DOUBLE_SUPPORT)
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#endif
*/
__kernel void set_to_without_mask_C1_D0(float4 scalar,__global uchar * dstMat,
int cols,int rows,int dstStep_in_pixel,int offset_in_pixel)
{
int x=get_global_id(0)<<2;
int y=get_global_id(1);
int addr_start = mad24(y,dstStep_in_pixel,offset_in_pixel);
int addr_end = mad24(y,dstStep_in_pixel,cols+offset_in_pixel);
int idx = mad24(y, dstStep_in_pixel, (x + offset_in_pixel) & (int)0xfffffffc); // align the 4-pixel store down to a 4-byte boundary
uchar4 out;
out.x = out.y = out.z = out.w = convert_uchar_sat(scalar.x);
if ( (idx>=addr_start)&(idx+3 < addr_end) & (y < rows))
{
*(__global uchar4*)(dstMat+idx) = out;
}
else if(y < rows)
{
uchar4 temp = *(__global uchar4*)(dstMat+idx);
temp.x = (idx>=addr_start)&(idx < addr_end)? out.x : temp.x;
temp.y = (idx+1>=addr_start)&(idx+1 < addr_end)? out.y : temp.y;
temp.z = (idx+2>=addr_start)&(idx+2 < addr_end)? out.z : temp.z;
temp.w = (idx+3>=addr_start)&(idx+3 < addr_end)? out.w : temp.w;
*(__global uchar4*)(dstMat+idx) = temp;
}
}
__kernel void set_to_without_mask_C4_D0(float4 scalar,__global uchar4 * dstMat,
int cols,int rows,int dstStep_in_pixel,int offset_in_pixel)
{
int x=get_global_id(0);
int y=get_global_id(1);
if ( (x < cols) & (y < rows))
{
int idx = mad24(y,dstStep_in_pixel,x+ offset_in_pixel);
dstMat[idx] = convert_uchar4_sat(scalar);
}
}
__kernel void set_to_without_mask_C1_D4(float4 scalar,__global int * dstMat,
int cols,int rows,int dstStep_in_pixel,int offset_in_pixel)
{
int x=get_global_id(0);
int y=get_global_id(1);
if ( (x < cols) & (y < rows))
{
int idx = mad24(y, dstStep_in_pixel, x+offset_in_pixel);
dstMat[idx] = convert_int_sat(scalar.x);
}
}
__kernel void set_to_without_mask_C4_D4(float4 scalar,__global int4 * dstMat,
int cols,int rows,int dstStep_in_pixel,int offset_in_pixel)
{
int x=get_global_id(0);
int y=get_global_id(1);
if ( (x < cols) & (y < rows))
{
int idx = mad24(y,dstStep_in_pixel,x+ offset_in_pixel);
dstMat[idx] = convert_int4_sat(scalar);
}
}
__kernel void set_to_without_mask_C1_D5(float4 scalar,__global float * dstMat,
int cols,int rows,int dstStep_in_pixel,int offset_in_pixel)
{
int x=get_global_id(0);
int y=get_global_id(1);
if ( (x < cols) & (y < rows))
{
int idx = mad24(y,dstStep_in_pixel,x+ offset_in_pixel);
dstMat[idx] = scalar.x;
}
}
__kernel void set_to_without_mask_C4_D5(float4 scalar,__global float4 * dstMat,
int cols,int rows,int dstStep_in_pixel,int offset_in_pixel)
{
int x=get_global_id(0);
int y=get_global_id(1);
if ( (x < cols) & (y < rows))
{
int idx = mad24(y,dstStep_in_pixel,x+ offset_in_pixel);
dstMat[idx] = scalar;
}
}
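
In the C1_D0 kernel above each work-item covers four pixels: the store index is aligned down to a 4-byte boundary, fully interior vectors take a single uchar4 store, and vectors straddling the row's valid range [addr_start, addr_end) fall back to a per-byte merge. A minimal host-side model of that edge path (plain C, assuming a single row buffer):

#include <stdint.h>

/* Guarded per-byte merge, as in the else-branch of the kernel above. */
static void set_row_edge(uint8_t *row, int addr_start, int addr_end,
int x, int offset_in_pixel, uint8_t value)
{
int idx = (x + offset_in_pixel) & ~3; /* align down to 4 bytes */
for (int i = 0; i < 4; ++i)
if (idx + i >= addr_start && idx + i < addr_end)
row[idx + i] = value;
}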

View File

@ -0,0 +1,227 @@
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Niko Li, newlife20080214@gmail.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//
/*#if defined (__ATI__)
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#elif defined (__NVIDIA__)
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#endif
*/
/*
__kernel void set_to_with_mask_C1_D0(
float4 scalar,
__global uchar* dstMat,
int cols,
int rows,
int dstStep_in_pixel,
int dstoffset_in_pixel,
__global const uchar * maskMat,
int maskStep,
int maskoffset)
{
int x=get_global_id(0);
int y=get_global_id(1);
int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
int maskidx = mad24(y,maskStep,x+ maskoffset);
uchar mask = maskMat[maskidx];
if ( (x < cols) & (y < rows) & mask)
{
dstMat[dstidx] = convert_uchar_sat(scalar.x);
}
}
*/
//#pragma OPENCL EXTENSION cl_amd_printf : enable
__kernel void set_to_with_mask_C1_D0(
float4 scalar,
__global uchar* dstMat,
int cols,
int rows,
int dstStep_in_pixel,
int dstoffset_in_pixel,
__global const uchar * restrict maskMat,
int maskStep,
int maskoffset)
{
int x=get_global_id(0)<<2;
int y=get_global_id(1);
int dst_addr_start = mad24(y,dstStep_in_pixel,dstoffset_in_pixel);
int dst_addr_end = mad24(y,dstStep_in_pixel,cols+dstoffset_in_pixel);
int dstidx = mad24(y, dstStep_in_pixel, (x + dstoffset_in_pixel) & (int)0xfffffffc); // align down to a 4-byte boundary
int mask_addr_start = mad24(y,maskStep,maskoffset);
int mask_addr_end = mad24(y,maskStep,cols+maskoffset);
int maskidx = mad24(y, maskStep, (x + maskoffset) & (int)0xfffffffc); // align down to a 4-byte boundary
uchar out = convert_uchar_sat(scalar.x);
int off_mask = (maskoffset & 3) - (dstoffset_in_pixel & 3) +3;
if ( (x < cols) & (y < rows) )
{
uchar4 temp_dst = *(__global uchar4*)(dstMat+dstidx);
uchar4 temp_mask1 = *(__global uchar4*)(maskMat+maskidx-4);
uchar4 temp_mask = *(__global uchar4*)(maskMat+maskidx);
uchar4 temp_mask2 = *(__global uchar4*)(maskMat+maskidx+4);
temp_mask1.x = (maskidx-4 >=mask_addr_start)&(maskidx-4 < mask_addr_end) ? temp_mask1.x : 0;
temp_mask1.y = (maskidx-3 >=mask_addr_start)&(maskidx-3 < mask_addr_end) ? temp_mask1.y : 0;
temp_mask1.z = (maskidx-2 >=mask_addr_start)&(maskidx-2 < mask_addr_end) ? temp_mask1.z : 0;
temp_mask1.w = (maskidx-1 >=mask_addr_start)&(maskidx-1 < mask_addr_end) ? temp_mask1.w : 0;
temp_mask.x = (maskidx >=mask_addr_start)&(maskidx < mask_addr_end) ? temp_mask.x : 0;
temp_mask.y = (maskidx+1 >=mask_addr_start)&(maskidx+1 < mask_addr_end) ? temp_mask.y : 0;
temp_mask.z = (maskidx+2 >=mask_addr_start)&(maskidx+2 < mask_addr_end) ? temp_mask.z : 0;
temp_mask.w = (maskidx+3 >=mask_addr_start)&(maskidx+3 < mask_addr_end) ? temp_mask.w : 0;
temp_mask2.x = (maskidx+4 >=mask_addr_start)&(maskidx+4 < mask_addr_end) ? temp_mask2.x : 0;
temp_mask2.y = (maskidx+5 >=mask_addr_start)&(maskidx+5 < mask_addr_end) ? temp_mask2.y : 0;
temp_mask2.z = (maskidx+6 >=mask_addr_start)&(maskidx+6 < mask_addr_end) ? temp_mask2.z : 0;
temp_mask2.w = (maskidx+7 >=mask_addr_start)&(maskidx+7 < mask_addr_end) ? temp_mask2.w : 0;
uchar trans_mask[10] = {temp_mask1.y,temp_mask1.z,temp_mask1.w,temp_mask.x,temp_mask.y,temp_mask.z,temp_mask.w,temp_mask2.x,temp_mask2.y,temp_mask2.z};
temp_dst.x = (dstidx>=dst_addr_start)&(dstidx<dst_addr_end)& trans_mask[off_mask] ? out : temp_dst.x;
temp_dst.y = (dstidx+1>=dst_addr_start)&(dstidx+1<dst_addr_end)& trans_mask[off_mask+1] ? out : temp_dst.y;
temp_dst.z = (dstidx+2>=dst_addr_start)&(dstidx+2<dst_addr_end)& trans_mask[off_mask+2] ? out : temp_dst.z;
temp_dst.w = (dstidx+3>=dst_addr_start)&(dstidx+3<dst_addr_end)& trans_mask[off_mask+3] ? out : temp_dst.w;
*(__global uchar4*)(dstMat+dstidx) = temp_dst;
}
}
__kernel void set_to_with_mask_C4_D0(
float4 scalar,
__global uchar4 * dstMat,
int cols,
int rows,
int dstStep_in_pixel,
int dstoffset_in_pixel,
__global const uchar * restrict maskMat,
int maskStep,
int maskoffset)
{
int x=get_global_id(0);
int y=get_global_id(1);
int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
int maskidx = mad24(y,maskStep,x+ maskoffset);
uchar mask = maskMat[maskidx];
if ( (x < cols) & (y < rows) & mask)
{
dstMat[dstidx] = convert_uchar4_sat(scalar);
}
}
__kernel void set_to_with_mask_C1_D4(
float4 scalar,
__global int * dstMat,
int cols,
int rows,
int dstStep_in_pixel,
int dstoffset_in_pixel,
__global const uchar * restrict maskMat,
int maskStep,
int maskoffset)
{
int x=get_global_id(0);
int y=get_global_id(1);
int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
int maskidx = mad24(y,maskStep,x+ maskoffset);
uchar mask = maskMat[maskidx];
if ( (x < cols) & (y < rows) & mask)
{
dstMat[dstidx] = convert_int_sat(scalar.x);
}
}
__kernel void set_to_with_mask_C4_D4(
float4 scalar,
__global int4 * dstMat,
int cols,
int rows,
int dstStep_in_pixel,
int dstoffset_in_pixel,
__global const uchar * restrict maskMat,
int maskStep,
int maskoffset)
{
int x=get_global_id(0);
int y=get_global_id(1);
int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
int maskidx = mad24(y,maskStep,x+ maskoffset);
uchar mask = maskMat[maskidx];
if ( (x < cols) & (y < rows) & mask)
{
dstMat[dstidx] = convert_int4_sat(scalar);
}
}
__kernel void set_to_with_mask_C1_D5(
float4 scalar,
__global float * dstMat,
int cols,
int rows,
int dstStep_in_pixel,
int dstoffset_in_pixel,
__global const uchar * restrict maskMat,
int maskStep,
int maskoffset)
{
int x=get_global_id(0);
int y=get_global_id(1);
int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
int maskidx = mad24(y,maskStep,x+ maskoffset);
uchar mask = maskMat[maskidx];
if ( (x < cols) & (y < rows) & mask)
{
dstMat[dstidx] = scalar.x;
}
}
__kernel void set_to_with_mask_C4_D5(
float4 scalar,
__global float4 * dstMat,
int cols,
int rows,
int dstStep_in_pixel,
int dstoffset_in_pixel,
__global const uchar * restrict maskMat,
int maskStep,
int maskoffset)
{
int x=get_global_id(0);
int y=get_global_id(1);
int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
int maskidx = mad24(y,maskStep,x+ maskoffset);
uchar mask = maskMat[maskidx];
if ( (x < cols) & (y < rows) & mask)
{
dstMat[dstidx] = scalar;
}
}
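
The C1_D0 variant above also has to cope with the destination and the mask being misaligned by different sub-word amounts: it loads twelve mask bytes around the aligned position and selects four of them through the ten-entry trans_mask window. The window arithmetic in isolation (illustrative only):

/* off_mask = (maskoffset & 3) - (dstoffset & 3) + 3 ranges over 0..6, so
off_mask + lane (lane = 0..3) always lands inside trans_mask[10] -- enough
to absorb a relative shift of up to 3 bytes in either direction. */
static int mask_window_index(int maskoffset, int dstoffset_in_pixel, int lane)
{
int off_mask = (maskoffset & 3) - (dstoffset_in_pixel & 3) + 3;
return off_mask + lane;
}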

File diff suppressed because it is too large

View File

@ -0,0 +1,427 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Jia Haipeng, jiahaipeng95@gmail.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#define ROWSperTHREAD 21 // the number of rows a thread will process
#define BLOCK_W 128 // the thread block width (464)
#define N_DISPARITIES 8
#define STEREO_MIND 0 // The minimum d range to check
#define STEREO_DISP_STEP N_DISPARITIES // the d step, must be <= 1 to avoid aliasing
int SQ(int a)
{
return a * a;
}
unsigned int CalcSSD(volatile __local unsigned int *col_ssd_cache,
volatile __local unsigned int *col_ssd, int radius)
{
unsigned int cache = 0;
unsigned int cache2 = 0;
for(int i = 1; i <= radius; i++)
cache += col_ssd[i];
col_ssd_cache[0] = cache;
barrier(CLK_LOCAL_MEM_FENCE);
if (get_local_id(0) < BLOCK_W - radius)
cache2 = col_ssd_cache[radius];
else
for(int i = radius + 1; i < (2 * radius + 1); i++)
cache2 += col_ssd[i];
return col_ssd[0] + cache + cache2;
}
uint2 MinSSD(volatile __local unsigned int *col_ssd_cache,
volatile __local unsigned int *col_ssd, int radius)
{
unsigned int ssd[N_DISPARITIES];
//See above: #define COL_SSD_SIZE (BLOCK_W + 2 * radius)
ssd[0] = CalcSSD(col_ssd_cache, col_ssd + 0 * (BLOCK_W + 2 * radius), radius);
barrier(CLK_LOCAL_MEM_FENCE);
ssd[1] = CalcSSD(col_ssd_cache, col_ssd + 1 * (BLOCK_W + 2 * radius), radius);
barrier(CLK_LOCAL_MEM_FENCE);
ssd[2] = CalcSSD(col_ssd_cache, col_ssd + 2 * (BLOCK_W + 2 * radius), radius);
barrier(CLK_LOCAL_MEM_FENCE);
ssd[3] = CalcSSD(col_ssd_cache, col_ssd + 3 * (BLOCK_W + 2 * radius), radius);
barrier(CLK_LOCAL_MEM_FENCE);
ssd[4] = CalcSSD(col_ssd_cache, col_ssd + 4 * (BLOCK_W + 2 * radius), radius);
barrier(CLK_LOCAL_MEM_FENCE);
ssd[5] = CalcSSD(col_ssd_cache, col_ssd + 5 * (BLOCK_W + 2 * radius), radius);
barrier(CLK_LOCAL_MEM_FENCE);
ssd[6] = CalcSSD(col_ssd_cache, col_ssd + 6 * (BLOCK_W + 2 * radius), radius);
barrier(CLK_LOCAL_MEM_FENCE);
ssd[7] = CalcSSD(col_ssd_cache, col_ssd + 7 * (BLOCK_W + 2 * radius), radius);
barrier(CLK_LOCAL_MEM_FENCE);
unsigned int mssd = min(min(min(ssd[0], ssd[1]), min(ssd[4], ssd[5])), min(min(ssd[2], ssd[3]), min(ssd[6], ssd[7])));
int bestIdx = 0;
for (int i = 0; i < N_DISPARITIES; i++)
{
if (mssd == ssd[i])
bestIdx = i;
}
return (uint2)(mssd, bestIdx);
}
void StepDown(int idx1, int idx2, __global unsigned char* imageL,
__global unsigned char* imageR, int d, volatile __local unsigned int *col_ssd, int radius)
{
unsigned char leftPixel1;
unsigned char leftPixel2;
unsigned char rightPixel1[8];
unsigned char rightPixel2[8];
unsigned int diff1, diff2;
leftPixel1 = imageL[idx1];
leftPixel2 = imageL[idx2];
idx1 = idx1 - d;
idx2 = idx2 - d;
rightPixel1[7] = imageR[idx1 - 7];
rightPixel1[0] = imageR[idx1 - 0];
rightPixel1[1] = imageR[idx1 - 1];
rightPixel1[2] = imageR[idx1 - 2];
rightPixel1[3] = imageR[idx1 - 3];
rightPixel1[4] = imageR[idx1 - 4];
rightPixel1[5] = imageR[idx1 - 5];
rightPixel1[6] = imageR[idx1 - 6];
rightPixel2[7] = imageR[idx2 - 7];
rightPixel2[0] = imageR[idx2 - 0];
rightPixel2[1] = imageR[idx2 - 1];
rightPixel2[2] = imageR[idx2 - 2];
rightPixel2[3] = imageR[idx2 - 3];
rightPixel2[4] = imageR[idx2 - 4];
rightPixel2[5] = imageR[idx2 - 5];
rightPixel2[6] = imageR[idx2 - 6];
//See above: #define COL_SSD_SIZE (BLOCK_W + 2 * radius)
diff1 = leftPixel1 - rightPixel1[0];
diff2 = leftPixel2 - rightPixel2[0];
col_ssd[0 * (BLOCK_W + 2 * radius)] += SQ(diff2) - SQ(diff1);
diff1 = leftPixel1 - rightPixel1[1];
diff2 = leftPixel2 - rightPixel2[1];
col_ssd[1 * (BLOCK_W + 2 * radius)] += SQ(diff2) - SQ(diff1);
diff1 = leftPixel1 - rightPixel1[2];
diff2 = leftPixel2 - rightPixel2[2];
col_ssd[2 * (BLOCK_W + 2 * radius)] += SQ(diff2) - SQ(diff1);
diff1 = leftPixel1 - rightPixel1[3];
diff2 = leftPixel2 - rightPixel2[3];
col_ssd[3 * (BLOCK_W + 2 * radius)] += SQ(diff2) - SQ(diff1);
diff1 = leftPixel1 - rightPixel1[4];
diff2 = leftPixel2 - rightPixel2[4];
col_ssd[4 * (BLOCK_W + 2 * radius)] += SQ(diff2) - SQ(diff1);
diff1 = leftPixel1 - rightPixel1[5];
diff2 = leftPixel2 - rightPixel2[5];
col_ssd[5 * (BLOCK_W + 2 * radius)] += SQ(diff2) - SQ(diff1);
diff1 = leftPixel1 - rightPixel1[6];
diff2 = leftPixel2 - rightPixel2[6];
col_ssd[6 * (BLOCK_W + 2 * radius)] += SQ(diff2) - SQ(diff1);
diff1 = leftPixel1 - rightPixel1[7];
diff2 = leftPixel2 - rightPixel2[7];
col_ssd[7 * (BLOCK_W + 2 * radius)] += SQ(diff2) - SQ(diff1);
}
void InitColSSD(int x_tex, int y_tex, int im_pitch, __global unsigned char* imageL,
__global unsigned char* imageR, int d,
volatile __local unsigned int *col_ssd, int radius)
{
unsigned char leftPixel1;
int idx;
unsigned int diffa[] = {0, 0, 0, 0, 0, 0, 0, 0};
for(int i = 0; i < (2 * radius + 1); i++)
{
idx = y_tex * im_pitch + x_tex;
leftPixel1 = imageL[idx];
idx = idx - d;
diffa[0] += SQ(leftPixel1 - imageR[idx - 0]);
diffa[1] += SQ(leftPixel1 - imageR[idx - 1]);
diffa[2] += SQ(leftPixel1 - imageR[idx - 2]);
diffa[3] += SQ(leftPixel1 - imageR[idx - 3]);
diffa[4] += SQ(leftPixel1 - imageR[idx - 4]);
diffa[5] += SQ(leftPixel1 - imageR[idx - 5]);
diffa[6] += SQ(leftPixel1 - imageR[idx - 6]);
diffa[7] += SQ(leftPixel1 - imageR[idx - 7]);
y_tex += 1;
}
//See above: #define COL_SSD_SIZE (BLOCK_W + 2 * radius)
col_ssd[0 * (BLOCK_W + 2 * radius)] = diffa[0];
col_ssd[1 * (BLOCK_W + 2 * radius)] = diffa[1];
col_ssd[2 * (BLOCK_W + 2 * radius)] = diffa[2];
col_ssd[3 * (BLOCK_W + 2 * radius)] = diffa[3];
col_ssd[4 * (BLOCK_W + 2 * radius)] = diffa[4];
col_ssd[5 * (BLOCK_W + 2 * radius)] = diffa[5];
col_ssd[6 * (BLOCK_W + 2 * radius)] = diffa[6];
col_ssd[7 * (BLOCK_W + 2 * radius)] = diffa[7];
}
__kernel void stereoKernel(__global unsigned char *left, __global unsigned char *right,
__global unsigned int *cminSSDImage, int cminSSD_step,
__global unsigned char *disp, int disp_step,int cwidth, int cheight,
int img_step, int maxdisp, int radius,
__local unsigned int *col_ssd_cache)
{
volatile __local unsigned int *col_ssd = col_ssd_cache + BLOCK_W + get_local_id(0);
volatile __local unsigned int *col_ssd_extra = get_local_id(0) < (2 * radius) ? col_ssd + BLOCK_W : 0;
int X = get_group_id(0) * BLOCK_W + get_local_id(0) + maxdisp + radius;
// int Y = get_group_id(1) * ROWSperTHREAD + radius;
#define Y (get_group_id(1) * ROWSperTHREAD + radius)
volatile __global unsigned int* minSSDImage = cminSSDImage + X + Y * cminSSD_step;
__global unsigned char* disparImage = disp + X + Y * disp_step;
int end_row = ROWSperTHREAD < (cheight - Y) ? ROWSperTHREAD:(cheight - Y);
int y_tex;
int x_tex = X - radius;
if (x_tex >= cwidth)
return;
for(int d = STEREO_MIND; d < maxdisp; d += STEREO_DISP_STEP)
{
y_tex = Y - radius;
InitColSSD(x_tex, y_tex, img_step, left, right, d, col_ssd, radius);
if (col_ssd_extra)
if (x_tex + BLOCK_W < cwidth)
InitColSSD(x_tex + BLOCK_W, y_tex, img_step, left, right, d, col_ssd_extra, radius);
barrier(CLK_LOCAL_MEM_FENCE); //before MinSSD function
if (X < cwidth - radius && Y < cheight - radius)
{
uint2 minSSD = MinSSD(col_ssd_cache + get_local_id(0), col_ssd, radius);
if (minSSD.x < minSSDImage[0])
{
disparImage[0] = (unsigned char)(d + minSSD.y);
minSSDImage[0] = minSSD.x;
}
}
for(int row = 1; row < end_row; row++)
{
int idx1 = y_tex * img_step + x_tex;
int idx2 = (y_tex + (2 * radius + 1)) * img_step + x_tex;
barrier(CLK_GLOBAL_MEM_FENCE);
barrier(CLK_LOCAL_MEM_FENCE);
StepDown(idx1, idx2, left, right, d, col_ssd, radius);
if (col_ssd_extra)
if (x_tex + BLOCK_W < cwidth)
StepDown(idx1, idx2, left + BLOCK_W, right + BLOCK_W, d, col_ssd_extra, radius);
y_tex += 1;
barrier(CLK_LOCAL_MEM_FENCE);
if (X < cwidth - radius && row < cheight - radius - Y)
{
int idx = row * cminSSD_step;
uint2 minSSD = MinSSD(col_ssd_cache + get_local_id(0), col_ssd, radius);
if (minSSD.x < minSSDImage[idx])
{
disparImage[disp_step * row] = (unsigned char)(d + minSSD.y);
minSSDImage[idx] = minSSD.x;
}
}
} // for row loop
} // for d loop
}
//////////////////////////////////////////////////////////////////////////////////////////////////
//////////////////////////// Sobel Prefiler (signal channel)//////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////////////////
__kernel void prefilter_xsobel(__global unsigned char *input, __global unsigned char *output,
int rows, int cols, int prefilterCap)
{
int x = get_global_id(0);
int y = get_global_id(1);
if(x < cols && y < rows)
{
// clamp the 3x3 taps to the image border; the original indexing read
// out of bounds on the first and last row and column
int x0 = max(x - 1, 0);
int x2 = min(x + 1, cols - 1);
int y0 = max(y - 1, 0);
int y2 = min(y + 1, rows - 1);
int cov = input[y0 * cols + x0] * (-1) + input[y0 * cols + x2] * (1) +
input[y * cols + x0] * (-2) + input[y * cols + x2] * (2) +
input[y2 * cols + x0] * (-1) + input[y2 * cols + x2] * (1);
cov = min(min(max(-prefilterCap, cov), prefilterCap) + prefilterCap, 255);
output[y * cols + x] = cov & 0xFF;
}
}
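// Illustrative restatement (not referenced by the kernels): the clamp in
// prefilter_xsobel maps a signed Sobel response v into [0, 255] by limiting
// it to [-prefilterCap, prefilterCap] and shifting it up by prefilterCap.
uchar prefilter_map(int v, int cap)
{
int clamped = max(-cap, min(v, cap)); // limit to [-cap, cap]
return (uchar)min(clamped + cap, 255); // shift to >= 0 and fit into 8 bits
}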
//////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////// Textureness filtering ////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////////////////
float sobel(__global unsigned char *input, int x, int y, int rows, int cols)
{
float conv = 0;
// clamp every tap into the image; the callers pass coordinates up to
// winsz/2 outside the border, and the original clamped only x-1 and y-1
// at zero, so the x+1/y+1 taps could read out of bounds
int x0 = min(max(x - 1, 0), cols - 1);
int x2 = min(max(x + 1, 0), cols - 1);
int y0 = min(max(y - 1, 0), rows - 1);
int ym = min(max(y, 0), rows - 1);
int y2 = min(max(y + 1, 0), rows - 1);
if(x < cols && y < rows)
{
conv = (float)input[y0 * cols + x0] * (-1) + (float)input[y0 * cols + x2] * (1) +
(float)input[ym * cols + x0] * (-2) + (float)input[ym * cols + x2] * (2) +
(float)input[y2 * cols + x0] * (-1) + (float)input[y2 * cols + x2] * (1);
}
return fabs(conv);
}
float CalcSums(__local float *cols, __local float *cols_cache, int winsz)
{
float cache = 0;
float cache2 = 0;
int winsz2 = winsz/2;
int x = get_local_id(0);
int group_size_x = get_local_size(0);
for(int i = 1; i <= winsz2; i++)
cache += cols[i];
cols_cache[0] = cache;
barrier(CLK_LOCAL_MEM_FENCE);
if (x < group_size_x - winsz2)
cache2 = cols_cache[winsz2];
else
for(int i = winsz2 + 1; i < winsz; i++)
cache2 += cols[i];
return cols[0] + cache + cache2;
}
#define RpT (2 * ROWSperTHREAD) // determined experimentally
__kernel void textureness_kernel(__global unsigned char *disp, int disp_rows, int disp_cols,
int disp_step, __global unsigned char *input, int input_rows,
int input_cols,int winsz, float threshold,
__local float *cols_cache)
{
int winsz2 = winsz/2;
int n_dirty_pixels = (winsz2) * 2;
int local_id_x = get_local_id(0);
int group_size_x = get_local_size(0);
int group_id_y = get_group_id(1);
__local float *cols = cols_cache + group_size_x + local_id_x;
__local float *cols_extra = local_id_x < n_dirty_pixels ? cols + group_size_x : 0;
int x = get_global_id(0);
int beg_row = group_id_y * RpT;
int end_row = min(beg_row + RpT, disp_rows);
if (x < disp_cols)
{
int y = beg_row;
float sum = 0;
float sum_extra = 0;
for(int i = y - winsz2; i <= y + winsz2; ++i)
{
sum += sobel(input, x - winsz2, i, input_rows, input_cols);
if (cols_extra)
sum_extra += sobel(input, x + group_size_x - winsz2, i, input_rows, input_cols);
}
*cols = sum;
if (cols_extra)
*cols_extra = sum_extra;
barrier(CLK_LOCAL_MEM_FENCE);
float sum_win = CalcSums(cols, cols_cache + local_id_x, winsz) * 255;
if (sum_win < threshold)
disp[y * disp_step + x] = 0;
barrier(CLK_LOCAL_MEM_FENCE);
for(int y = beg_row + 1; y < end_row; ++y)
{
sum = sum - sobel(input, x - winsz2, y - winsz2 - 1, input_rows, input_cols) +
sobel(input, x - winsz2, y + winsz2, input_rows, input_cols);
*cols = sum;
if (cols_extra)
{
sum_extra = sum_extra - sobel(input, x + group_size_x - winsz2, y - winsz2 - 1,input_rows, input_cols)
+ sobel(input, x + group_size_x - winsz2, y + winsz2, input_rows, input_cols);
*cols_extra = sum_extra;
}
barrier(CLK_LOCAL_MEM_FENCE);
float sum_win = CalcSums(cols, cols_cache + local_id_x, winsz) * 255;
if (sum_win < threshold)
disp[y * disp_step + x] = 0;
barrier(CLK_LOCAL_MEM_FENCE);
}
}
}
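
StepDown above is the core of the block matcher: when the vertical SSD window slides down one row, each column sum is updated in O(1) per disparity by subtracting the departing row's squared difference and adding the arriving one's, instead of resumming the whole column. A scalar model of the recurrence (illustration only; the kernel does this for eight disparities at once):

static int sq(int a) { return a * a; }

/* col_ssd is the running sum of squared differences for one image column
and one disparity d; idx1 indexes the row leaving the window, idx2 the
row entering it. */
static void step_down_scalar(const unsigned char *left, const unsigned char *right,
int idx1, int idx2, int d, unsigned int *col_ssd)
{
int out_diff = left[idx1] - right[idx1 - d];
int in_diff = left[idx2] - right[idx2 - d];
*col_ssd += sq(in_diff) - sq(out_diff);
}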

View File

@ -0,0 +1,580 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Jia Haipeng, jiahaipeng95@gmail.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if defined (__ATI__)
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#elif defined (__NVIDIA__)
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#endif
///////////////////////////////////////////////////////////////
/////////////////common///////////////////////////////////////
/////////////////////////////////////////////////////////////
short round_short(float v)
{
return convert_short_sat_rte(v);
}
#define FLOAT_MAX 3.402823466e+38f
typedef struct
{
int cndisp;
float cmax_data_term;
float cdata_weight;
float cmax_disc_term;
float cdisc_single_jump;
} con_struct_t;
///////////////////////////////////////////////////////////////
////////////////////////// comp data //////////////////////////
///////////////////////////////////////////////////////////////
float pix_diff_1(__global const uchar *ls, __global const uchar *rs)
{
return abs((int)(*ls) - *rs);
}
float pix_diff_3(__global const uchar *ls, __global const uchar *rs)
{
const float tr = 0.299f;
const float tg = 0.587f;
const float tb = 0.114f;
float val;
val = tb * abs((int)ls[0] - rs[0]);
val += tg * abs((int)ls[1] - rs[1]);
val += tr * abs((int)ls[2] - rs[2]);
return val;
}
float pix_diff_4(__global const uchar *ls, __global const uchar *rs)
{
uchar4 l, r;
l = *((__global uchar4 *)ls);
r = *((__global uchar4 *)rs);
const float tr = 0.299f;
const float tg = 0.587f;
const float tb = 0.114f;
float val;
val = tb * abs((int)l.x - r.x);
val += tg * abs((int)l.y - r.y);
val += tr * abs((int)l.z - r.z);
return val;
}
__kernel void comp_data_0(__global uchar *left, int left_rows, int left_cols, int left_step,
__global uchar *right, int right_step,
__global short *data, int data_cols, int data_step,
__constant con_struct_t *con_st, int cn)
// int cndisp, float cmax_data_term, float cdata_weight, int cn)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (y > 0 && y < (left_rows - 1) && x > 0 && x < (left_cols - 1))
{
const __global uchar* ls = left + y * left_step + x * cn;
const __global uchar* rs = right + y * right_step + x * cn;
__global short *ds = (__global short *)((__global uchar *)data + y * data_step) + x;
const unsigned int disp_step = data_cols * left_rows;
for (int disp = 0; disp < con_st -> cndisp; disp++)
{
if (x - disp >= 1)
{
float val = 0;
if(cn == 1)
val = pix_diff_1(ls, rs - disp * cn);
if(cn == 3)
val = pix_diff_3(ls, rs - disp * cn);
if(cn == 4)
val = pix_diff_4(ls, rs - disp *cn);
ds[disp * disp_step] = round_short(fmin(con_st -> cdata_weight * val,
con_st -> cdata_weight * con_st -> cmax_data_term));
}
else
{
ds[disp * disp_step] = round_short(con_st -> cdata_weight * con_st -> cmax_data_term);
}
}
}
}
__kernel void comp_data_1(__global uchar *left, int left_rows, int left_cols, int left_step,
__global uchar *right, int right_step,
__global float *data, int data_cols, int data_step,
__constant con_struct_t *con_st, int cn)
//int cndisp, float cmax_data_term, float cdata_weight, int cn)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (y > 0 && y < left_rows - 1 && x > 0 && x < left_cols - 1)
{
const __global uchar* ls = left + y * left_step + x * cn;
const __global uchar* rs = right + y * right_step + x * cn;
__global float *ds = (__global float *)((__global char *)data + y * data_step) + x;
const unsigned int disp_step = data_cols * left_rows;
for (int disp = 0; disp < con_st -> cndisp; disp++)
{
if (x - disp >= 1)
{
float val = 0;
if(cn == 1)
val = pix_diff_1(ls, rs - disp * cn);
if(cn == 3)
val = pix_diff_3(ls, rs - disp * cn);
if(cn == 4)
val = pix_diff_4(ls, rs - disp *cn);
ds[disp * disp_step] = fmin(con_st -> cdata_weight * val,
con_st -> cdata_weight * con_st -> cmax_data_term);
}
else
{
ds[disp * disp_step] = con_st -> cdata_weight * con_st -> cmax_data_term;
}
}
}
}
///////////////////////////////////////////////////////////////
//////////////////////// data step down ///////////////////////
///////////////////////////////////////////////////////////////
__kernel void data_step_down_0(__global short *src, int src_rows, int src_cols,
__global short *dst, int dst_rows, int dst_cols, int dst_real_cols,
int cndisp)
{
const int x = get_global_id(0);
const int y = get_global_id(1);
if (x < dst_cols && y < dst_rows)
{
for (int d = 0; d < cndisp; ++d)
{
//float dst_reg = src.ptr(d * src_rows + (2*y+0))[(2*x+0)];
float dst_reg;
dst_reg = src[(d * src_rows + (2*y+0)) * src_cols + 2*x+0];
dst_reg += src[(d * src_rows + (2*y+1)) * src_cols + 2*x+0];
dst_reg += src[(d * src_rows + (2*y+0)) * src_cols + 2*x+1];
dst_reg += src[(d * src_rows + (2*y+1)) * src_cols + 2*x+1];
//dst.ptr(d * dst_rows + y)[x] = saturate_cast<T>(dst_reg);
dst[(d * dst_rows + y) * dst_real_cols + x] = round_short(dst_reg);
}
}
}
__kernel void data_step_down_1(__global float *src, int src_rows, int src_cols,
__global float *dst, int dst_rows, int dst_cols, int dst_real_cols,
int cndisp)
{
const int x = get_global_id(0);
const int y = get_global_id(1);
if (x < dst_cols && y < dst_rows)
{
for (int d = 0; d < cndisp; ++d)
{
//float dst_reg = src.ptr(d * src_rows + (2*y+0))[(2*x+0)];
float dst_reg;
dst_reg = src[(d * src_rows + (2*y+0)) * src_cols + 2*x+0];
dst_reg += src[(d * src_rows + (2*y+1)) * src_cols + 2*x+0];
dst_reg += src[(d * src_rows + (2*y+0)) * src_cols + 2*x+1];
dst_reg += src[(d * src_rows + (2*y+1)) * src_cols + 2*x+1];
//dst.ptr(d * dst_rows + y)[x] = saturate_cast<T>(dst_reg);
dst[(d * dst_rows + y) * dst_real_cols + x] = dst_reg; // saturate_cast<float> is the identity, so no rounding on the float path
}
}
}
///////////////////////////////////////////////////////////////
/////////////////// level up messages ////////////////////////
///////////////////////////////////////////////////////////////
__kernel void level_up_message_0(__global short *src, int src_rows, int src_step,
__global short *dst, int dst_rows, int dst_cols, int dst_step,
int cndisp)
{
const int x = get_global_id(0);
const int y = get_global_id(1);
if (x < dst_cols && y < dst_rows)
{
const int dst_disp_step = (dst_step / sizeof(short)) * dst_rows;
const int src_disp_step = (src_step / sizeof(short)) * src_rows;
__global short *dstr = (__global short *)((__global char *)dst + y * dst_step) + x;
__global const short *srcr = (__global short *)((__global char *)src + y/2 * src_step) + x/2;
for (int d = 0; d < cndisp; ++d)
dstr[d * dst_disp_step] = srcr[d * src_disp_step];
}
}
__kernel void level_up_message_1(__global float *src, int src_rows, int src_step,
__global float *dst, int dst_rows, int dst_cols, int dst_step,
int cndisp)
{
const int x = get_global_id(0);
const int y = get_global_id(1);
if (x < dst_cols && y < dst_rows)
{
const int dst_disp_step = (dst_step/sizeof(float)) * dst_rows;
const int src_disp_step = (src_step/sizeof(float)) * src_rows;
__global float *dstr = (__global float *)((__global char *)dst + y * dst_step) + x;
__global const float *srcr = (__global float *)((__global char *)src + y/2 * src_step) + x/2;
for (int d = 0; d < cndisp; ++d)
dstr[d * dst_disp_step] = srcr[d * src_disp_step];
}
}
///////////////////////////////////////////////////////////////
//////////////////// calc all iterations /////////////////////
///////////////////////////////////////////////////////////////
void calc_min_linear_penalty_0(__global short * dst, int disp_step,
int cndisp, float cdisc_single_jump)
{
float prev = dst[0];
float cur;
for (int disp = 1; disp < cndisp; ++disp)
{
prev += cdisc_single_jump;
cur = dst[disp_step * disp];
if (prev < cur)
{
cur = prev;
dst[disp_step * disp] = round_short(prev);
}
prev = cur;
}
prev = dst[(cndisp - 1) * disp_step];
for (int disp = cndisp - 2; disp >= 0; disp--)
{
prev += cdisc_single_jump;
cur = dst[disp_step * disp];
if (prev < cur)
{
cur = prev;
dst[disp_step * disp] = round_short(prev);
}
prev = cur;
}
}
void message_0(const __global short *msg1, const __global short *msg2,
const __global short *msg3, const __global short *data, __global short *dst,
int msg_disp_step, int data_disp_step, int cndisp, float cmax_disc_term, float cdisc_single_jump)
{
float minimum = FLOAT_MAX;
for(int i = 0; i < cndisp; ++i)
{
float dst_reg;
dst_reg = msg1[msg_disp_step * i];
dst_reg += msg2[msg_disp_step * i];
dst_reg += msg3[msg_disp_step * i];
dst_reg += data[data_disp_step * i];
if (dst_reg < minimum)
minimum = dst_reg;
dst[msg_disp_step * i] = round_short(dst_reg);
}
calc_min_linear_penalty_0(dst, msg_disp_step, cndisp, cdisc_single_jump);
minimum += cmax_disc_term;
float sum = 0;
for(int i = 0; i < cndisp; ++i)
{
float dst_reg = dst[msg_disp_step * i];
if (dst_reg > minimum)
{
dst_reg = minimum;
dst[msg_disp_step * i] = round_short(minimum);
}
sum += dst_reg;
}
sum /= cndisp;
for(int i = 0; i < cndisp; ++i)
dst[msg_disp_step * i] -= sum;
}
__kernel void one_iteration_0(__global short *u, int u_step, int u_cols,
__global short *data, int data_step, int data_cols,
__global short *d, __global short *l, __global short *r,
int t, int cols, int rows,
int cndisp, float cmax_disc_term, float cdisc_single_jump)
{
const int y = get_global_id(1);
const int x = ((get_global_id(0)) << 1) + ((y + t) & 1);
if ((y > 0) && (y < rows - 1) && (x > 0) && (x < cols - 1))
{
__global short *us = (__global short *)((__global char *)u + y * u_step) + x;
__global short *ds = d + y * u_cols + x;
__global short *ls = l + y * u_cols + x;
__global short *rs = r + y * u_cols + x;
const __global short *dt = (__global short *)((__global char *)data + y * data_step) + x;
int msg_disp_step = u_cols * rows;
int data_disp_step = data_cols * rows;
message_0(us + u_cols, ls + 1, rs - 1, dt, us, msg_disp_step, data_disp_step, cndisp,
cmax_disc_term, cdisc_single_jump);
message_0(ds - u_cols, ls + 1, rs - 1, dt, ds, msg_disp_step, data_disp_step, cndisp,
cmax_disc_term, cdisc_single_jump);
message_0(us + u_cols, ds - u_cols, rs - 1, dt, rs, msg_disp_step, data_disp_step, cndisp,
cmax_disc_term, cdisc_single_jump);
message_0(us + u_cols, ds - u_cols, ls + 1, dt, ls, msg_disp_step, data_disp_step, cndisp,
cmax_disc_term, cdisc_single_jump);
}
}
void calc_min_linear_penalty_1(__global float * dst, int step,
int cndisp, float cdisc_single_jump)
{
float prev = dst[0];
float cur;
for (int disp = 1; disp < cndisp; ++disp)
{
prev += cdisc_single_jump;
cur = dst[step * disp];
if (prev < cur)
{
cur = prev;
dst[step * disp] = prev;
}
prev = cur;
}
prev = dst[(cndisp - 1) * step];
for (int disp = cndisp - 2; disp >= 0; disp--)
{
prev += cdisc_single_jump;
cur = dst[step * disp];
if (prev < cur)
{
cur = prev;
dst[step * disp] = prev;
}
prev = cur;
}
}
void message_1(const __global float *msg1, const __global float *msg2,
const __global float *msg3, const __global float *data, __global float *dst,
int msg_disp_step, int data_disp_step, int cndisp, float cmax_disc_term, float cdisc_single_jump)
{
float minimum = FLOAT_MAX;
for(int i = 0; i < cndisp; ++i)
{
float dst_reg = 0;
dst_reg = msg1[msg_disp_step * i];
dst_reg += msg2[msg_disp_step * i];
dst_reg += msg3[msg_disp_step * i];
dst_reg += data[data_disp_step * i];
if (dst_reg < minimum)
minimum = dst_reg;
dst[msg_disp_step * i] = dst_reg;
}
calc_min_linear_penalty_1(dst, msg_disp_step, cndisp, cdisc_single_jump);
minimum += cmax_disc_term;
float sum = 0;
for(int i = 0; i < cndisp; ++i)
{
float dst_reg = dst[msg_disp_step * i];
if (dst_reg > minimum)
{
dst_reg = minimum;
dst[msg_disp_step * i] = minimum;
}
sum += dst_reg;
}
sum /= cndisp;
for(int i = 0; i < cndisp; ++i)
dst[msg_disp_step * i] -= sum;
}
__kernel void one_iteration_1(__global float *u, int u_step, int u_cols,
__global float *data, int data_step, int data_cols,
__global float *d, __global float *l, __global float *r,
int t, int cols, int rows,
int cndisp,float cmax_disc_term, float cdisc_single_jump)
{
const int y = get_global_id(1);
const int x = ((get_global_id(0)) << 1) + ((y + t) & 1);
if ((y > 0) && (y < rows - 1) && (x > 0) && (x < cols - 1))
{
__global float* us = (__global float *)((__global char *)u + y * u_step) + x;
__global float* ds = d + y * u_cols + x;
__global float* ls = l + y * u_cols + x;
__global float* rs = r + y * u_cols + x;
const __global float* dt = (__global float *)((__global char *)data + y * data_step) + x;
int msg_disp_step = u_cols * rows;
int data_disp_step = data_cols * rows;
message_1(us + u_cols, ls + 1, rs - 1, dt, us, msg_disp_step, data_disp_step, cndisp,
cmax_disc_term, cdisc_single_jump);
message_1(ds - u_cols, ls + 1, rs - 1, dt, ds, msg_disp_step, data_disp_step, cndisp,
cmax_disc_term, cdisc_single_jump);
message_1(us + u_cols, ds - u_cols, rs - 1, dt, rs, msg_disp_step, data_disp_step, cndisp,
cmax_disc_term, cdisc_single_jump);
message_1(us + u_cols, ds - u_cols, ls + 1, dt, ls, msg_disp_step, data_disp_step, cndisp,
cmax_disc_term, cdisc_single_jump);
}
}
///////////////////////////////////////////////////////////////
/////////////////////////// output ////////////////////////////
///////////////////////////////////////////////////////////////
__kernel void output_0(const __global short *u, int u_step, int u_cols,
const __global short *d, const __global short *l,
const __global short *r, const __global short *data,
__global short *disp, int disp_rows, int disp_cols, int disp_step,
int cndisp)
{
const int x = get_global_id(0);
const int y = get_global_id(1);
if (y > 0 && y < disp_rows - 1 && x > 0 && x < disp_cols - 1)
{
const __global short *us =(__global short *)((__global char *)u + (y + 1) * u_step) + x;
const __global short *ds = d + (y - 1) * u_cols + x;
const __global short *ls = l + y * u_cols + (x + 1);
const __global short *rs = r + y * u_cols + (x - 1);
const __global short *dt = data + y * u_cols + x;
int disp_steps = disp_rows * u_cols;
int best = 0;
float best_val = FLOAT_MAX;
for (int d = 0; d < cndisp; ++d)
{
float val;
val = us[d * disp_steps];
val += ds[d * disp_steps];
val += ls[d * disp_steps];
val += rs[d * disp_steps];
val += dt[d * disp_steps];
if (val < best_val)
{
best_val = val;
best = d;
}
}
((__global short *)((__global char *)disp + y * disp_step))[x] = convert_short_sat(best);
}
}
__kernel void output_1(const __global float *u, int u_step, int u_cols,
const __global float *d, const __global float *l,
const __global float *r, const __global float *data,
__global short *disp, int disp_rows, int disp_cols, int disp_step,
int cndisp)
{
const int x = get_global_id(0);
const int y = get_global_id(1);
if (y > 0 && y < disp_rows - 1 && x > 0 && x < disp_cols - 1)
{
const __global float *us =(__global float *)((__global char *)u + (y + 1) * u_step) + x;
const __global float *ds = d + (y - 1) * u_cols + x;
const __global float *ls = l + y * u_cols + (x + 1);
const __global float *rs = r + y * u_cols + (x - 1);
const __global float *dt = data + y * u_cols + x;
int disp_steps = disp_rows * u_cols;
int best = 0;
float best_val = FLOAT_MAX;
for (int d = 0; d < cndisp; ++d)
{
float val;
val = us[d * disp_steps];
val += ds[d * disp_steps];
val += ls[d * disp_steps];
val += rs[d * disp_steps];
val += dt[d * disp_steps];
if (val < best_val)
{
best_val = val;
best = d;
}
}
//disp[y * disp_cols + x] = convert_short_sat(best);
((__global short *)((__global char *)disp + y * disp_step))[x] = convert_short_sat(best);
}
}
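
calc_min_linear_penalty_0/_1 above are the standard O(cndisp) lower-envelope pass for a linear discontinuity cost: after a forward and a backward sweep, every entry holds the minimum over j of dst[j] + cdisc_single_jump * |i - j|. The same computation on a contiguous array (assumption: unit stride, unlike the strided message buffers the kernels walk):

static void min_linear_penalty(float *dst, int ndisp, float jump)
{
for (int i = 1; i < ndisp; ++i) /* forward sweep */
if (dst[i - 1] + jump < dst[i])
dst[i] = dst[i - 1] + jump;
for (int i = ndisp - 2; i >= 0; --i) /* backward sweep */
if (dst[i + 1] + jump < dst[i])
dst[i] = dst[i + 1] + jump;
}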

File diff suppressed because it is too large

View File

@ -0,0 +1,562 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Niko Li, newlife20080214@gmail.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
#define ALIGN 32
#define GPU_MATRIX_MALLOC_STEP(step) (((step) + ALIGN - 1) / ALIGN) * ALIGN
using namespace cv;
using namespace cv::ocl;
using namespace std;
////////////////////////////////////////////////////////////////////////
//////////////////////////////// oclMat ////////////////////////////////
////////////////////////////////////////////////////////////////////////
#if !defined (HAVE_OPENCL)
namespace cv
{
namespace ocl
{
void oclMat::upload(const Mat& /*m*/)
{
throw_nogpu();
}
void oclMat::download(cv::Mat& /*m*/) const
{
throw_nogpu();
}
void oclMat::copyTo( oclMat& /*m*/ ) const
{
throw_nogpu();
}
void oclMat::copyTo( oclMat& /*m*/, const oclMat&/* mask */) const
{
throw_nogpu();
}
void oclMat::convertTo( oclMat& /*m*/, int /*rtype*/, double /*alpha*/, double /*beta*/ ) const
{
throw_nogpu();
}
oclMat &oclMat::operator = (const Scalar& /*s*/)
{
throw_nogpu();
return *this;
}
oclMat &oclMat::setTo(const Scalar& /*s*/, const oclMat& /*mask*/)
{
throw_nogpu();
return *this;
}
oclMat oclMat::reshape(int /*new_cn*/, int /*new_rows*/) const
{
throw_nogpu();
return oclMat();
}
void oclMat::create(int /*_rows*/, int /*_cols*/, int /*_type*/)
{
throw_nogpu();
}
void oclMat::release()
{
throw_nogpu();
}
}
}
#else /* !defined (HAVE_OPENCL) */
//helper routines
namespace cv
{
namespace ocl
{
///////////////////////////OpenCL kernel strings///////////////////////////
extern const char *operator_copyToM;
extern const char *operator_convertTo;
extern const char *operator_setTo;
extern const char *operator_setToM;
}
}
////////////////////////////////////////////////////////////////////////
// convert_C3C4
void convert_C3C4(const cl_mem &src, oclMat &dst, int srcStep)
{
int dstStep = dst.step1() / dst.channels();
Context *clCxt = dst.clCxt;
string kernelName = "convertC3C4";
vector< pair<size_t, const void *> > args;
args.push_back( make_pair( sizeof(cl_mem), (void *)&src));
args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data));
args.push_back( make_pair( sizeof(cl_int), (void *)&dst.wholecols));
args.push_back( make_pair( sizeof(cl_int), (void *)&dst.wholerows));
args.push_back( make_pair( sizeof(cl_int), (void *)&srcStep));
args.push_back( make_pair( sizeof(cl_int), (void *)&dstStep));
size_t globalThreads[3] = {(dst.wholecols *dst.wholerows + 255) / 256 * 256, 1, 1};
size_t localThreads[3] = {256, 1, 1};
openCLExecuteKernel(clCxt, &convertC3C4, kernelName, globalThreads, localThreads, args, -1, dst.elemSize1() >> 1);
}
////////////////////////////////////////////////////////////////////////
// convert_C4C3
void convert_C4C3(const oclMat &src, cl_mem &dst, int dstStep)
{
int srcStep = src.step1() / src.channels();
Context *clCxt = src.clCxt;
string kernelName = "convertC4C3";
vector< pair<size_t, const void *> > args;
args.push_back( make_pair( sizeof(cl_mem), (void *)&src.data));
args.push_back( make_pair( sizeof(cl_mem), (void *)&dst));
args.push_back( make_pair( sizeof(cl_int), (void *)&src.wholecols));
args.push_back( make_pair( sizeof(cl_int), (void *)&src.wholerows));
args.push_back( make_pair( sizeof(cl_int), (void *)&srcStep));
args.push_back( make_pair( sizeof(cl_int), (void *)&dstStep));
size_t globalThreads[3] = {(src.wholecols *src.wholerows + 255) / 256 * 256, 1, 1};
size_t localThreads[3] = {256, 1, 1};
openCLExecuteKernel(clCxt, &convertC3C4, kernelName, globalThreads, localThreads, args, -1, src.elemSize1() >> 1);
}
void cv::ocl::oclMat::upload(const Mat &m)
{
CV_DbgAssert(!m.empty());
Size wholeSize;
Point ofs;
m.locateROI(wholeSize, ofs);
int type = m.type();
//if(m.channels() == 3)
//type = CV_MAKETYPE(m.depth(), 4);
create(wholeSize, type);
//if(m.channels() == 3)
//{
//int pitch = GPU_MATRIX_MALLOC_STEP(wholeSize.width * 3 * m.elemSize1());
//int err;
//cl_mem temp = clCreateBuffer(clCxt->clContext,CL_MEM_READ_WRITE,
//pitch*wholeSize.height,0,&err);
//CV_DbgAssert(err==0);
//openCLMemcpy2D(clCxt,temp,pitch,m.datastart,m.step,wholeSize.width*m.elemSize(),wholeSize.height,clMemcpyHostToDevice);
//convert_C3C4(temp, *this, pitch);
//}
//else
openCLMemcpy2D(clCxt, data, step, m.datastart, m.step, wholeSize.width * elemSize(), wholeSize.height, clMemcpyHostToDevice);
rows = m.rows;
cols = m.cols;
offset = ofs.y * step + ofs.x * elemSize();
download_channels = m.channels();
}
void cv::ocl::oclMat::download(cv::Mat &m) const
{
CV_DbgAssert(!this->empty());
int t = type();
//if(download_channels == 3)
//t = CV_MAKETYPE(depth(), 3);
m.create(wholerows, wholecols, t);
//if(download_channels == 3)
//{
//int pitch = GPU_MATRIX_MALLOC_STEP(wholecols * 3 * m.elemSize1());
//int err;
//cl_mem temp = clCreateBuffer(clCxt->clContext,CL_MEM_READ_WRITE,
//pitch*wholerows,0,&err);
//CV_DbgAssert(err==0);
//convert_C4C3(*this, temp, pitch/m.elemSize1());
//openCLMemcpy2D(clCxt,m.data,m.step,temp,pitch,wholecols*m.elemSize(),wholerows,clMemcpyDeviceToHost);
//}
//else
openCLMemcpy2D(clCxt, m.data, m.step, data, step, wholecols * elemSize(), wholerows, clMemcpyDeviceToHost);
Size wholesize;
Point ofs;
locateROI(wholesize, ofs);
m.adjustROI(-ofs.y, ofs.y + rows - wholerows, -ofs.x, ofs.x + cols - wholecols);
}
/////////////////////common//////////////////////////////////////
inline int divUp(int total, int grain)
{
return (total + grain - 1) / grain;
}
///////////////////////////////////////////////////////////////////////////
////////////////////////////////// CopyTo /////////////////////////////////
///////////////////////////////////////////////////////////////////////////
void copy_to_with_mask(const oclMat &src, oclMat &dst, const oclMat &mask, string kernelName)
{
CV_DbgAssert( dst.rows == mask.rows && dst.cols == mask.cols &&
src.rows == dst.rows && src.cols == dst.cols);
vector<pair<size_t , const void *> > args;
int vector_lengths[4][7] = {{4, 4, 2, 2, 1, 1, 1},
{2, 2, 1, 1, 1, 1, 1},
{8, 8, 8, 8 , 4, 4, 4}, //vector length is undefined when channels = 3
{1, 1, 1, 1, 1, 1, 1}
};
size_t localThreads[3] = {16, 16, 1};
size_t globalThreads[3];
int vector_length = vector_lengths[dst.channels() -1][dst.depth()];
int offset_cols = divUp(dst.offset, dst.elemSize()) & (vector_length - 1);
int cols = vector_length == 1 ? divUp(dst.cols, vector_length) : divUp(dst.cols + offset_cols, vector_length);
globalThreads[0] = divUp(cols, localThreads[0]) * localThreads[0];
globalThreads[1] = divUp(dst.rows, localThreads[1]) * localThreads[1];
globalThreads[2] = 1;
int dststep_in_pixel = dst.step / dst.elemSize(), dstoffset_in_pixel = dst.offset / dst.elemSize();
int srcstep_in_pixel = src.step / src.elemSize(), srcoffset_in_pixel = src.offset / src.elemSize();
args.push_back( make_pair( sizeof(cl_mem) , (void *)&src.data ));
args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst.data ));
args.push_back( make_pair( sizeof(cl_mem) , (void *)&mask.data ));
args.push_back( make_pair( sizeof(cl_int) , (void *)&src.cols ));
args.push_back( make_pair( sizeof(cl_int) , (void *)&src.rows ));
args.push_back( make_pair( sizeof(cl_int) , (void *)&srcstep_in_pixel ));
args.push_back( make_pair( sizeof(cl_int) , (void *)&srcoffset_in_pixel ));
args.push_back( make_pair( sizeof(cl_int) , (void *)&dststep_in_pixel ));
args.push_back( make_pair( sizeof(cl_int) , (void *)&dstoffset_in_pixel ));
args.push_back( make_pair( sizeof(cl_int) , (void *)&mask.step ));
args.push_back( make_pair( sizeof(cl_int) , (void *)&mask.offset ));
openCLExecuteKernel(dst.clCxt , &operator_copyToM, kernelName, globalThreads,
localThreads, args, dst.channels(), dst.depth());
}
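// Worked example (illustrative) of the launch-size arithmetic above for a
// CV_8UC1 destination with dst.cols = 100 and dst.offset = 6:
//   vector_length = vector_lengths[0][0] = 4
//   offset_cols   = divUp(6, 1) & 3 = 2           (sub-vector misalignment)
//   cols          = divUp(100 + 2, 4) = 26        (uchar4 stores per row)
//   globalThreads[0] = divUp(26, 16) * 16 = 32
// so each work-item handles one 4-pixel vector, rounded up to 16-wide groups.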
void cv::ocl::oclMat::copyTo( oclMat &m ) const
{
CV_DbgAssert(!this->empty());
m.create(size(), type());
openCLCopyBuffer2D(clCxt, m.data, m.step, m.offset,
data, step, cols * elemSize(), rows, offset, clMemcpyDeviceToDevice);
}
void cv::ocl::oclMat::copyTo( oclMat &mat, const oclMat &mask) const
{
if (mask.empty())
{
copyTo(mat);
}
else
{
mat.create(size(), type());
copy_to_with_mask(*this, mat, mask, "copy_to_with_mask");
}
}
///////////////////////////////////////////////////////////////////////////
//////////////////////////////// ConvertTo ////////////////////////////////
///////////////////////////////////////////////////////////////////////////
void convert_run(const oclMat &src, oclMat &dst, double alpha, double beta)
{
string kernelName = "convert_to_S";
stringstream idxStr;
idxStr << src.depth();
kernelName += idxStr.str();
float alpha_f = alpha, beta_f = beta;
CV_DbgAssert(src.rows == dst.rows && src.cols == dst.cols);
vector<pair<size_t , const void *> > args;
size_t localThreads[3] = {16, 16, 1};
size_t globalThreads[3];
globalThreads[0] = (dst.cols + localThreads[0] - 1) / localThreads[0] * localThreads[0];
globalThreads[1] = (dst.rows + localThreads[1] - 1) / localThreads[1] * localThreads[1];
globalThreads[2] = 1;
int dststep_in_pixel = dst.step / dst.elemSize(), dstoffset_in_pixel = dst.offset / dst.elemSize();
int srcstep_in_pixel = src.step / src.elemSize(), srcoffset_in_pixel = src.offset / src.elemSize();
if(dst.type() == CV_8UC1)
{
globalThreads[0] = ((dst.cols + 4) / 4 + localThreads[0] - 1) / localThreads[0] * localThreads[0];
}
args.push_back( make_pair( sizeof(cl_mem) , (void *)&src.data ));
args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst.data ));
args.push_back( make_pair( sizeof(cl_int) , (void *)&src.cols ));
args.push_back( make_pair( sizeof(cl_int) , (void *)&src.rows ));
args.push_back( make_pair( sizeof(cl_int) , (void *)&srcstep_in_pixel ));
args.push_back( make_pair( sizeof(cl_int) , (void *)&srcoffset_in_pixel ));
args.push_back( make_pair( sizeof(cl_int) , (void *)&dststep_in_pixel ));
args.push_back( make_pair( sizeof(cl_int) , (void *)&dstoffset_in_pixel ));
args.push_back( make_pair( sizeof(cl_float) , (void *)&alpha_f ));
args.push_back( make_pair( sizeof(cl_float) , (void *)&beta_f ));
openCLExecuteKernel(dst.clCxt , &operator_convertTo, kernelName, globalThreads,
localThreads, args, dst.channels(), dst.depth());
}
void cv::ocl::oclMat::convertTo( oclMat &dst, int rtype, double alpha, double beta ) const
{
//cout << "cv::ocl::oclMat::convertTo()" << endl;
bool noScale = fabs(alpha - 1) < std::numeric_limits<double>::epsilon()
&& fabs(beta) < std::numeric_limits<double>::epsilon();
if( rtype < 0 )
rtype = type();
else
rtype = CV_MAKETYPE(CV_MAT_DEPTH(rtype), channels());
//int scn = channels();
int sdepth = depth(), ddepth = CV_MAT_DEPTH(rtype);
if( sdepth == ddepth && noScale )
{
copyTo(dst);
return;
}
oclMat temp;
const oclMat *psrc = this;
if( sdepth != ddepth && psrc == &dst )
psrc = &(temp = *this);
dst.create( size(), rtype );
convert_run(*psrc, dst, alpha, beta);
}
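// Hypothetical usage (assumption: the oclMat API as declared in this file),
// showing the convertTo semantics dst = saturate(src * alpha + beta). Note
// that convert_run narrows alpha/beta to float, so the scale and shift are
// applied in single precision on the device.
static void example_convert_to_float(const cv::ocl::oclMat &src8u, cv::ocl::oclMat &dst32f)
{
src8u.convertTo(dst32f, CV_32F, 1.0 / 255.0, 0.0); // 8U -> 32F in [0, 1]
}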
///////////////////////////////////////////////////////////////////////////
//////////////////////////////// setTo ////////////////////////////////////
///////////////////////////////////////////////////////////////////////////
oclMat &cv::ocl::oclMat::operator = (const Scalar &s)
{
//cout << "cv::ocl::oclMat::=" << endl;
setTo(s);
return *this;
}
void set_to_withoutmask_run(const oclMat &dst, const Scalar &scalar, string kernelName)
{
vector<pair<size_t , const void *> > args;
cl_float4 val;
val.s[0] = scalar.val[0];
val.s[1] = scalar.val[1];
val.s[2] = scalar.val[2];
val.s[3] = scalar.val[3];
size_t localThreads[3] = {16, 16, 1};
size_t globalThreads[3];
globalThreads[0] = (dst.cols + localThreads[0] - 1) / localThreads[0] * localThreads[0];
globalThreads[1] = (dst.rows + localThreads[1] - 1) / localThreads[1] * localThreads[1];
globalThreads[2] = 1;
int step_in_pixel = dst.step / dst.elemSize(), offset_in_pixel = dst.offset / dst.elemSize();
if(dst.type() == CV_8UC1)
{
globalThreads[0] = ((dst.cols + 4) / 4 + localThreads[0] - 1) / localThreads[0] * localThreads[0];
}
args.push_back( make_pair( sizeof(cl_float4) , (void *)&val ));
args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst.data ));
args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.cols ));
args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.rows ));
args.push_back( make_pair( sizeof(cl_int) , (void *)&step_in_pixel ));
args.push_back( make_pair( sizeof(cl_int) , (void *)&offset_in_pixel));
openCLExecuteKernel(dst.clCxt , &operator_setTo, kernelName, globalThreads,
localThreads, args, dst.channels(), dst.depth());
}
void set_to_withmask_run(const oclMat &dst, const Scalar &scalar, const oclMat &mask, string kernelName)
{
CV_DbgAssert( dst.rows == mask.rows && dst.cols == mask.cols);
vector<pair<size_t , const void *> > args;
cl_float4 val;
val.s[0] = scalar.val[0];
val.s[1] = scalar.val[1];
val.s[2] = scalar.val[2];
val.s[3] = scalar.val[3];
size_t localThreads[3] = {16, 16, 1};
size_t globalThreads[3];
globalThreads[0] = (dst.cols + localThreads[0] - 1) / localThreads[0] * localThreads[0];
globalThreads[1] = (dst.rows + localThreads[1] - 1) / localThreads[1] * localThreads[1];
globalThreads[2] = 1;
if(dst.type() == CV_8UC1)
{
globalThreads[0] = ((dst.cols + 4) / 4 + localThreads[0] - 1) / localThreads[0] * localThreads[0];
}
int step_in_pixel = dst.step / dst.elemSize(), offset_in_pixel = dst.offset / dst.elemSize();
args.push_back( make_pair( sizeof(cl_float4) , (void *)&val ));
args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst.data ));
args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.cols ));
args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.rows ));
args.push_back( make_pair( sizeof(cl_int) , (void *)&step_in_pixel ));
args.push_back( make_pair( sizeof(cl_int) , (void *)&offset_in_pixel ));
args.push_back( make_pair( sizeof(cl_mem) , (void *)&mask.data ));
args.push_back( make_pair( sizeof(cl_int) , (void *)&mask.step ));
args.push_back( make_pair( sizeof(cl_int) , (void *)&mask.offset ));
openCLExecuteKernel(dst.clCxt , &operator_setToM, kernelName, globalThreads,
localThreads, args, dst.channels(), dst.depth());
}
oclMat &cv::ocl::oclMat::setTo(const Scalar &scalar, const oclMat &mask)
{
//cout << "cv::ocl::oclMat::setTo()" << endl;
CV_Assert(mask.type() == CV_8UC1);
CV_Assert( this->depth() >= 0 && this->depth() <= 6 );
CV_DbgAssert( !this->empty());
//cl_int status;
//cl_mem mem;
//mem = clCreateBuffer(this->clCxt->clContext,CL_MEM_READ_WRITE,
// sizeof(double)*4,NULL,&status);
//openCLVerifyCall(status);
//double* s = (double *)scalar.val;
//openCLSafeCall(clEnqueueWriteBuffer(this->clCxt->clCmdQueue,
// (cl_mem)mem,1,0,sizeof(double)*4,s,0,0,0));
if (mask.empty())
{
set_to_withoutmask_run(*this, scalar, "set_to_without_mask");
}
else
{
set_to_withmask_run(*this, scalar, mask, "set_to_with_mask");
}
return *this;
}
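
setTo (and the Scalar assignment operator that forwards to it) fills the matrix with a scalar, optionally under a CV_8UC1 mask; whatever the destination depth, the scalar reaches the kernel as a cl_float4. Sketch:

cv::ocl::oclMat img(100, 100, CV_8UC3);
img = cv::Scalar(255, 0, 0);              // operator= forwards to setTo

cv::ocl::oclMat mask(100, 100, CV_8UC1);
mask.setTo(cv::Scalar(0));                // start with nothing selected
// ... mark a region of interest in mask ...
img.setTo(cv::Scalar(0, 255, 0), mask);   // only masked pixels change
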
oclMat cv::ocl::oclMat::reshape(int new_cn, int new_rows) const
{
if( new_rows != 0 && new_rows != rows)
{
CV_Error( CV_StsBadFunc,
"the number of rows of an oclMat cannot be changed in the current version" );
}
oclMat hdr = *this;
int cn = channels();
if (new_cn == 0)
new_cn = cn;
int total_width = cols * cn;
if ((new_cn > total_width || total_width % new_cn != 0) && new_rows == 0)
new_rows = rows * total_width / new_cn;
if (new_rows != 0 && new_rows != rows)
{
int total_size = total_width * rows;
if (!isContinuous())
CV_Error(CV_BadStep, "The matrix is not continuous, thus its number of rows cannot be changed");
if ((unsigned)new_rows > (unsigned)total_size)
CV_Error(CV_StsOutOfRange, "Bad new number of rows");
total_width = total_size / new_rows;
if (total_width * new_rows != total_size)
CV_Error(CV_StsBadArg, "The total number of matrix elements is not divisible by the new number of rows");
hdr.rows = new_rows;
hdr.step = total_width * elemSize1();
}
int new_width = total_width / new_cn;
if (new_width * new_cn != total_width)
CV_Error(CV_BadNumChannels, "The total width is not divisible by the new number of channels");
hdr.cols = new_width;
hdr.flags = (hdr.flags & ~CV_MAT_CN_MASK) | ((new_cn - 1) << CV_CN_SHIFT);
return hdr;
}
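
reshape only rewrites the header: it changes the channel count (and would change the row count, but the guard at the top of the function rejects that in this version), so no device data moves. Sketch:

cv::ocl::oclMat rgba(4, 8, CV_8UC4);        // 4x8, four channels

// The same device buffer viewed as one channel, 4x32
// (new_rows == 0 keeps the current row count).
cv::ocl::oclMat flat = rgba.reshape(1, 0);
CV_Assert(flat.cols == 32 && flat.channels() == 1);

// Changing the row count throws CV_StsBadFunc in this version:
// cv::ocl::oclMat col = rgba.reshape(1, 128);   // would throw
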
void cv::ocl::oclMat::create(int _rows, int _cols, int _type)
{
clCxt = Context::getContext();
//cout << "cv::ocl::oclMat::create()." << endl;
/* core logic */
_type &= TYPE_MASK;
if( rows == _rows && cols == _cols && type() == _type && data )
return;
if( data )
release();
CV_DbgAssert( _rows >= 0 && _cols >= 0 );
if( _rows > 0 && _cols > 0 )
{
flags = Mat::MAGIC_VAL + _type;
rows = _rows;
cols = _cols;
wholerows = _rows;
wholecols = _cols;
size_t esz = elemSize();
void *dev_ptr;
openCLMallocPitch(clCxt, &dev_ptr, &step, GPU_MATRIX_MALLOC_STEP(esz * cols), rows);
//openCLMallocPitch(clCxt,&dev_ptr, &step, esz * cols, rows);
if (esz * cols == step)
flags |= Mat::CONTINUOUS_FLAG;
int64 _nettosize = (int64)step * rows;
size_t nettosize = (size_t)_nettosize;
datastart = data = (uchar *)dev_ptr;
dataend = data + nettosize;
refcount = (int *)fastMalloc(sizeof(*refcount));
*refcount = 1;
}
}
void cv::ocl::oclMat::release()
{
//cout << "cv::ocl::oclMat::release()" << endl;
if( refcount && CV_XADD(refcount, -1) == 1 )
{
fastFree(refcount);
openCLFree(datastart);
}
data = datastart = dataend = 0;
step = rows = cols = 0;
offset = wholerows = wholecols = 0;
refcount = 0;
}
#endif /* !defined (HAVE_OPENCL) */
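
create allocates pitched device memory (step can be wider than elemSize()*cols, hence the CONTINUOUS_FLAG test) and attaches a reference count; release decrements it and frees the buffer only when the last header goes away. A sketch of the resulting sharing semantics, assuming the oclMat copy constructor bumps the shared refcount the way cv::Mat's does:

cv::ocl::oclMat a;
a.create(512, 512, CV_32FC1);   // allocates device memory, *refcount == 1

{
    cv::ocl::oclMat b = a;      // header copy: same data, refcount -> 2
}                               // b's destructor drops refcount back to 1

a.release();                    // refcount reaches 0: openCLFree runs
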

View File

@ -0,0 +1,414 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
#if !defined(HAVE_OPENCL)
namespace cv
{
namespace ocl
{
void meanShiftSegmentation(const oclMat &, Mat &, int, int, int, TermCriteria)
{
throw_nogpu();
}
}
}
#else
using namespace std;
// Auxiliary stuff
namespace
{
//
// Declarations
//
class DjSets
{
public:
DjSets(int n);
int find(int elem);
int merge(int set1, int set2);
vector<int> parent;
vector<int> rank;
vector<int> size;
private:
DjSets(const DjSets &) {}
DjSets operator =(const DjSets &);
};
template <typename T>
struct GraphEdge
{
GraphEdge() {}
GraphEdge(int to, int next, const T &val) : to(to), next(next), val(val) {}
int to;
int next;
T val;
};
template <typename T>
class Graph
{
public:
typedef GraphEdge<T> Edge;
Graph(int numv, int nume_max);
void addEdge(int from, int to, const T &val = T());
vector<int> start;
vector<Edge> edges;
int numv;
int nume_max;
int nume;
private:
Graph(const Graph &) {}
Graph operator =(const Graph &); // intentionally left undefined: copying is disabled
};
struct SegmLinkVal
{
SegmLinkVal() {}
SegmLinkVal(int dr, int dsp) : dr(dr), dsp(dsp) {}
bool operator <(const SegmLinkVal &other) const
{
return dr + dsp < other.dr + other.dsp;
}
int dr;
int dsp;
};
struct SegmLink
{
SegmLink() {}
SegmLink(int from, int to, const SegmLinkVal &val)
: from(from), to(to), val(val) {}
bool operator <(const SegmLink &other) const
{
return val < other.val;
}
int from;
int to;
SegmLinkVal val;
};
//
// Implementation
//
DjSets DjSets::operator = (const DjSets &obj)
{
//cout << "Invalid DjSets constructor\n";
CV_Error(-1, "Invalid DjSets constructor\n");
return *this;
}
DjSets::DjSets(int n) : parent(n), rank(n, 0), size(n, 1)
{
for (int i = 0; i < n; ++i)
parent[i] = i;
}
inline int DjSets::find(int elem)
{
int set = elem;
while (set != parent[set])
set = parent[set];
while (elem != parent[elem])
{
int next = parent[elem];
parent[elem] = set;
elem = next;
}
return set;
}
inline int DjSets::merge(int set1, int set2)
{
if (rank[set1] < rank[set2])
{
parent[set1] = set2;
size[set2] += size[set1];
return set2;
}
if (rank[set2] < rank[set1])
{
parent[set2] = set1;
size[set1] += size[set2];
return set1;
}
parent[set1] = set2;
rank[set2]++;
size[set2] += size[set1];
return set2;
}
template <typename T>
Graph<T>::Graph(int numv, int nume_max) : start(numv, -1), edges(nume_max)
{
this->numv = numv;
this->nume_max = nume_max;
nume = 0;
}
template <typename T>
inline void Graph<T>::addEdge(int from, int to, const T &val)
{
edges[nume] = Edge(to, start[from], val);
start[from] = nume;
nume++;
}
inline int pix(int y, int x, int ncols)
{
return y * ncols + x;
}
inline int sqr(int x)
{
return x * x;
}
inline int dist2(const cv::Vec4b &lhs, const cv::Vec4b &rhs)
{
return sqr(lhs[0] - rhs[0]) + sqr(lhs[1] - rhs[1]) + sqr(lhs[2] - rhs[2]);
}
inline int dist2(const cv::Vec2s &lhs, const cv::Vec2s &rhs)
{
return sqr(lhs[0] - rhs[0]) + sqr(lhs[1] - rhs[1]);
}
} // anonymous namespace
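
DjSets is a textbook disjoint-set structure (union by rank, path compression in find, per-set size tracking), and Graph stores directed edges in per-vertex singly linked lists through the start/next indices. A self-contained sketch of the same union-find behaviour, compilable without OpenCV (UnionFind is our stand-in name; find here uses path halving, which compresses the same chains as the two-pass loop above):

#include <cassert>
#include <utility>
#include <vector>

struct UnionFind   // mirrors DjSets above
{
    std::vector<int> parent, rnk, size;
    explicit UnionFind(int n) : parent(n), rnk(n, 0), size(n, 1)
    {
        for (int i = 0; i < n; ++i) parent[i] = i;
    }
    int find(int x)
    {
        while (x != parent[x]) { parent[x] = parent[parent[x]]; x = parent[x]; }
        return x;
    }
    int merge(int a, int b)    // a and b must be set representatives
    {
        if (a == b) return a;
        if (rnk[a] < rnk[b]) std::swap(a, b);
        parent[b] = a;
        if (rnk[a] == rnk[b]) rnk[a]++;
        size[a] += size[b];
        return a;
    }
};

int main()
{
    UnionFind uf(4);
    uf.merge(uf.find(0), uf.find(1));
    uf.merge(uf.find(1), uf.find(2));
    assert(uf.find(0) == uf.find(2));     // 0, 1, 2 are one component
    assert(uf.size[uf.find(0)] == 3);
    assert(uf.find(3) == 3);              // 3 is still on its own
    return 0;
}
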
namespace cv
{
namespace ocl
{
void meanShiftSegmentation(const oclMat &src, Mat &dst, int sp, int sr, int minsize, TermCriteria criteria)
{
CV_Assert(src.type() == CV_8UC4);
const int nrows = src.rows;
const int ncols = src.cols;
const int hr = sr;
const int hsp = sp;
// Perform mean shift procedure and obtain region and spatial maps
oclMat h_rmap, h_spmap;
meanShiftProc(src, h_rmap, h_spmap, sp, sr, criteria);
Mat rmap = h_rmap;
Mat spmap = h_spmap;
Graph<SegmLinkVal> g(nrows * ncols, 4 * (nrows - 1) * (ncols - 1)
+ (nrows - 1) + (ncols - 1));
// Build the region adjacency graph from the image
Vec4b r1;
Vec4b r2[4];
Vec2s sp1;
Vec2s sp2[4];
int dr[4];
int dsp[4];
for (int y = 0; y < nrows - 1; ++y)
{
Vec4b *ry = rmap.ptr<Vec4b>(y);
Vec4b *ryp = rmap.ptr<Vec4b>(y + 1);
Vec2s *spy = spmap.ptr<Vec2s>(y);
Vec2s *spyp = spmap.ptr<Vec2s>(y + 1);
for (int x = 0; x < ncols - 1; ++x)
{
r1 = ry[x];
sp1 = spy[x];
r2[0] = ry[x + 1];
r2[1] = ryp[x];
r2[2] = ryp[x + 1];
r2[3] = ryp[x];
sp2[0] = spy[x + 1];
sp2[1] = spyp[x];
sp2[2] = spyp[x + 1];
sp2[3] = spyp[x];
dr[0] = dist2(r1, r2[0]);
dr[1] = dist2(r1, r2[1]);
dr[2] = dist2(r1, r2[2]);
dsp[0] = dist2(sp1, sp2[0]);
dsp[1] = dist2(sp1, sp2[1]);
dsp[2] = dist2(sp1, sp2[2]);
// the fourth edge is the anti-diagonal one: (y, x+1) -> (y+1, x)
r1 = ry[x + 1];
sp1 = spy[x + 1];
dr[3] = dist2(r1, r2[3]);
dsp[3] = dist2(sp1, sp2[3]);
g.addEdge(pix(y, x, ncols), pix(y, x + 1, ncols), SegmLinkVal(dr[0], dsp[0]));
g.addEdge(pix(y, x, ncols), pix(y + 1, x, ncols), SegmLinkVal(dr[1], dsp[1]));
g.addEdge(pix(y, x, ncols), pix(y + 1, x + 1, ncols), SegmLinkVal(dr[2], dsp[2]));
g.addEdge(pix(y, x + 1, ncols), pix(y + 1, x, ncols), SegmLinkVal(dr[3], dsp[3]));
}
}
for (int y = 0; y < nrows - 1; ++y)
{
r1 = rmap.at<Vec4b>(y, ncols - 1);
r2[0] = rmap.at<Vec4b>(y + 1, ncols - 1);
sp1 = spmap.at<Vec2s>(y, ncols - 1);
sp2[0] = spmap.at<Vec2s>(y + 1, ncols - 1);
dr[0] = dist2(r1, r2[0]);
dsp[0] = dist2(sp1, sp2[0]);
g.addEdge(pix(y, ncols - 1, ncols), pix(y + 1, ncols - 1, ncols), SegmLinkVal(dr[0], dsp[0]));
}
for (int x = 0; x < ncols - 1; ++x)
{
r1 = rmap.at<Vec4b>(nrows - 1, x);
r2[0] = rmap.at<Vec4b>(nrows - 1, x + 1);
sp1 = spmap.at<Vec2s>(nrows - 1, x);
sp2[0] = spmap.at<Vec2s>(nrows - 1, x + 1);
dr[0] = dist2(r1, r2[0]);
dsp[0] = dist2(sp1, sp2[0]);
g.addEdge(pix(nrows - 1, x, ncols), pix(nrows - 1, x + 1, ncols), SegmLinkVal(dr[0], dsp[0]));
}
DjSets comps(g.numv);
// Merge adjacent components that are close in both color and position
for (int v = 0; v < g.numv; ++v)
{
for (int e_it = g.start[v]; e_it != -1; e_it = g.edges[e_it].next)
{
int c1 = comps.find(v);
int c2 = comps.find(g.edges[e_it].to);
if (c1 != c2 && g.edges[e_it].val.dr < hr && g.edges[e_it].val.dsp < hsp)
comps.merge(c1, c2);
}
}
vector<SegmLink> edges;
edges.reserve(g.numv);
// Prepare edges connecting different components
for (int v = 0; v < g.numv; ++v)
{
int c1 = comps.find(v);
for (int e_it = g.start[v]; e_it != -1; e_it = g.edges[e_it].next)
{
int c2 = comps.find(g.edges[e_it].to);
if (c1 != c2)
edges.push_back(SegmLink(c1, c2, g.edges[e_it].val));
}
}
// Sort all graph edges connecting different components (in ascending order)
sort(edges.begin(), edges.end());
// Absorb small components, merging the closest pairs first
for (size_t i = 0; i < edges.size(); ++i)
{
int c1 = comps.find(edges[i].from);
int c2 = comps.find(edges[i].to);
if (c1 != c2 && (comps.size[c1] < minsize || comps.size[c2] < minsize))
comps.merge(c1, c2);
}
// Sum the colors of the pixels belonging to each segment
Mat h_src = src;
vector<Vec4i> sumcols(nrows * ncols, Vec4i(0, 0, 0, 0));
for (int y = 0; y < nrows; ++y)
{
Vec4b *h_srcy = h_src.ptr<Vec4b>(y);
for (int x = 0; x < ncols; ++x)
{
int parent = comps.find(pix(y, x, ncols));
Vec4b col = h_srcy[x];
Vec4i &sumcol = sumcols[parent];
sumcol[0] += col[0];
sumcol[1] += col[1];
sumcol[2] += col[2];
}
}
// Create the final image: each segment gets the average color of its pixels
dst.create(src.size(), src.type());
for (int y = 0; y < nrows; ++y)
{
Vec4b *dsty = dst.ptr<Vec4b>(y);
for (int x = 0; x < ncols; ++x)
{
int parent = comps.find(pix(y, x, ncols));
const Vec4i &sumcol = sumcols[parent];
Vec4b &dstcol = dsty[x];
dstcol[0] = static_cast<uchar>(sumcol[0] / comps.size[parent]);
dstcol[1] = static_cast<uchar>(sumcol[1] / comps.size[parent]);
dstcol[2] = static_cast<uchar>(sumcol[2] / comps.size[parent]);
}
}
}
}
}
#endif // #if !defined (HAVE_OPENCL)
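
An end-to-end usage sketch for meanShiftSegmentation. The CV_8UC4 requirement and the parameter order come from the function above; the header path, the input file name, and the Mat-upload constructor (assumed to mirror GpuMat's) are assumptions:

#include <opencv2/opencv.hpp>
#include <opencv2/ocl/ocl.hpp>      // assumed header for this module

int main()
{
    cv::Mat bgr = cv::imread("input.png");      // hypothetical input image
    if (bgr.empty()) return 1;

    cv::Mat bgra;
    cv::cvtColor(bgr, bgra, CV_BGR2BGRA);       // the kernel expects CV_8UC4

    cv::ocl::oclMat d_src(bgra);                // upload to the device
    cv::Mat segmented;
    // sp: spatial window radius, sr: color window radius, minsize: smallest
    // surviving segment, criteria: mean-shift iteration limit.
    cv::ocl::meanShiftSegmentation(d_src, segmented, 10, 10, 20,
        cv::TermCriteria(CV_TERMCRIT_ITER + CV_TERMCRIT_EPS, 5, 1));

    cv::imwrite("segmented.png", segmented);
    return 0;
}
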

View File

@ -0,0 +1,47 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
//CriticalSection cs;
/* End of file. */

Some files were not shown because too many files have changed in this diff.