diff --git a/3rdparty/tbb/CMakeLists.txt b/3rdparty/tbb/CMakeLists.txt
index 4e510ffa3..378232873 100644
--- a/3rdparty/tbb/CMakeLists.txt
+++ b/3rdparty/tbb/CMakeLists.txt
@@ -231,9 +231,11 @@ if(ENABLE_SOLUTION_FOLDERS)
   set_target_properties(tbb PROPERTIES FOLDER "3rdparty")
 endif()
 
-if(NOT BUILD_SHARED_LIBS)
-  install(TARGETS tbb ARCHIVE DESTINATION ${OPENCV_3P_LIB_INSTALL_PATH} COMPONENT main)
-endif()
+install(TARGETS tbb
+    RUNTIME DESTINATION bin COMPONENT main
+    LIBRARY DESTINATION ${OPENCV_LIB_INSTALL_PATH} COMPONENT main
+    ARCHIVE DESTINATION ${OPENCV_3P_LIB_INSTALL_PATH} COMPONENT main
+    )
 
 # get TBB version
 ocv_parse_header("${tbb_src_dir}/include/tbb/tbb_stddef.h" TBB_VERSION_LINES TBB_VERSION_MAJOR TBB_VERSION_MINOR TBB_INTERFACE_VERSION CACHE)
diff --git a/cmake/OpenCVModule.cmake b/cmake/OpenCVModule.cmake
index b936065ce..1c30ab77d 100644
--- a/cmake/OpenCVModule.cmake
+++ b/cmake/OpenCVModule.cmake
@@ -522,11 +522,7 @@ macro(ocv_create_module)
 
   if((NOT DEFINED OPENCV_MODULE_TYPE AND BUILD_SHARED_LIBS)
       OR (DEFINED OPENCV_MODULE_TYPE AND OPENCV_MODULE_TYPE STREQUAL SHARED))
-    if(MSVC)
-      set_target_properties(${the_module} PROPERTIES DEFINE_SYMBOL CVAPI_EXPORTS)
-    else()
-      add_definitions(-DCVAPI_EXPORTS)
-    endif()
+    set_target_properties(${the_module} PROPERTIES DEFINE_SYMBOL CVAPI_EXPORTS)
   endif()
 
   if(MSVC)
diff --git a/cmake/OpenCVPCHSupport.cmake b/cmake/OpenCVPCHSupport.cmake
index 060965346..8af30f115 100644
--- a/cmake/OpenCVPCHSupport.cmake
+++ b/cmake/OpenCVPCHSupport.cmake
@@ -25,11 +25,13 @@ IF(CMAKE_COMPILER_IS_GNUCXX)
 
     SET(_PCH_include_prefix "-I")
     SET(_PCH_isystem_prefix "-isystem")
+    SET(_PCH_define_prefix "-D")
 
 ELSEIF(CMAKE_GENERATOR MATCHES "^Visual.*$")
     SET(PCHSupport_FOUND TRUE)
     SET(_PCH_include_prefix "/I")
     SET(_PCH_isystem_prefix "/I")
+    SET(_PCH_define_prefix "/D")
 ELSE()
     SET(PCHSupport_FOUND FALSE)
 ENDIF()
@@ -244,6 +246,14 @@ MACRO(ADD_PRECOMPILED_HEADER _targetName _input)
 
     _PCH_GET_COMPILE_FLAGS(_compile_FLAGS)
 
+    get_target_property(type ${_targetName} TYPE)
+    if(type STREQUAL "SHARED_LIBRARY")
+        get_target_property(__DEFINES ${_targetName} DEFINE_SYMBOL)
+        if(NOT __DEFINES MATCHES __DEFINES-NOTFOUND)
+            list(APPEND _compile_FLAGS "${_PCH_define_prefix}${__DEFINES}")
+        endif()
+    endif()
+
     #MESSAGE("_compile_FLAGS: ${_compile_FLAGS}")
     #message("COMMAND ${CMAKE_CXX_COMPILER}	${_compile_FLAGS} -x c++-header -o ${_output} ${_input}")
 
diff --git a/cmake/templates/cvconfig.h.cmake b/cmake/templates/cvconfig.h.cmake
index 56c5d5aad..d64399dbd 100644
--- a/cmake/templates/cvconfig.h.cmake
+++ b/cmake/templates/cvconfig.h.cmake
@@ -52,9 +52,6 @@
 /* IEEE1394 capturing support */
 #cmakedefine HAVE_DC1394
 
-/* libdc1394 0.9.4 or 0.9.5 */
-#cmakedefine HAVE_DC1394_095
-
 /* IEEE1394 capturing support - libdc1394 v2.x */
 #cmakedefine HAVE_DC1394_2
 
diff --git a/doc/tutorials/core/mat_the_basic_image_container/mat_the_basic_image_container.rst b/doc/tutorials/core/mat_the_basic_image_container/mat_the_basic_image_container.rst
index 76c5a4541..171d2e683 100644
--- a/doc/tutorials/core/mat_the_basic_image_container/mat_the_basic_image_container.rst
+++ b/doc/tutorials/core/mat_the_basic_image_container/mat_the_basic_image_container.rst
@@ -21,7 +21,7 @@ OpenCV has been around since 2001. In those days the library was built around a
 
 Luckily C++ came around and introduced the concept of classes making easier for the user through automatic memory management (more or less). The good news is that C++ is fully compatible with C so no compatibility issues can arise from making the change. Therefore, OpenCV 2.0 introduced a new C++ interface which offered a new way of doing things which means you do not need to fiddle with memory management, making your code concise (less to write, to achieve more). The main downside of the C++ interface is that many embedded development systems at the moment support only C. Therefore, unless you are targeting embedded platforms, there's no point to using the *old* methods (unless you're a masochist programmer and you're asking for trouble).
 
-The first thing you need to know about *Mat* is that you no longer need to manually allocate its memory and release it as soon as you do not need it. While doing this is still a possibility, most of the OpenCV functions will allocate its output data manually. As a nice bonus if you pass on an already existing *Mat* object, which has already  allocated the required space for the matrix, this will be reused. In other words we use at all times only as much memory as we need to perform the task.
+The first thing you need to know about *Mat* is that you no longer need to manually allocate its memory and release it as soon as you do not need it. While doing this is still a possibility, most of the OpenCV functions will allocate its output data automatically. As a nice bonus if you pass on an already existing *Mat* object, which has already  allocated the required space for the matrix, this will be reused. In other words we use at all times only as much memory as we need to perform the task.
 
 *Mat* is basically a class with two data parts: the matrix header (containing information such as the size of the matrix, the method used for storing, at which address is the matrix stored, and so on) and a pointer to the matrix containing the pixel values (taking any dimensionality depending on the method chosen for storing) . The matrix header size is constant, however the size of the matrix itself may vary from image to image and usually is larger by orders of magnitude.
 
diff --git a/doc/tutorials/definitions/tocDefinitions.rst b/doc/tutorials/definitions/tocDefinitions.rst
index c918aa2e8..c036b0b70 100644
--- a/doc/tutorials/definitions/tocDefinitions.rst
+++ b/doc/tutorials/definitions/tocDefinitions.rst
@@ -11,4 +11,5 @@
 .. |Author_EricCh| unicode:: Eric U+0020 Christiansen
 .. |Author_AndreyP| unicode:: Andrey U+0020 Pavlenko
 .. |Author_AlexS| unicode:: Alexander U+0020 Smorkalov
+.. |Author_BarisD| unicode:: Bar U+0131 U+015F U+0020 Evrim U+0020 Demir U+00F6 z
 .. |Author_DomenicoB| unicode:: Domenico U+0020 Daniele U+0020 Bloisi
diff --git a/doc/tutorials/features2d/feature_detection/feature_detection.rst b/doc/tutorials/features2d/feature_detection/feature_detection.rst
index 02da6d080..7705a1371 100644
--- a/doc/tutorials/features2d/feature_detection/feature_detection.rst
+++ b/doc/tutorials/features2d/feature_detection/feature_detection.rst
@@ -30,6 +30,7 @@ This tutorial code's is shown lines below. You can also download it from `here <
    #include <iostream>
    #include "opencv2/core.hpp"
    #include "opencv2/features2d.hpp"
+   #include "opencv2/nonfree/features2d.hpp"
    #include "opencv2/highgui.hpp"
    #include "opencv2/nonfree.hpp"
 
diff --git a/doc/tutorials/introduction/desktop_java/images/eclipse_main_class.png b/doc/tutorials/introduction/desktop_java/images/eclipse_main_class.png
deleted file mode 100644
index 84c152e6d..000000000
Binary files a/doc/tutorials/introduction/desktop_java/images/eclipse_main_class.png and /dev/null differ
diff --git a/doc/tutorials/introduction/desktop_java/images/eclipse_new_java_prj.png b/doc/tutorials/introduction/desktop_java/images/eclipse_new_java_prj.png
deleted file mode 100644
index 34e03972e..000000000
Binary files a/doc/tutorials/introduction/desktop_java/images/eclipse_new_java_prj.png and /dev/null differ
diff --git a/doc/tutorials/introduction/desktop_java/images/eclipse_run.png b/doc/tutorials/introduction/desktop_java/images/eclipse_run.png
deleted file mode 100644
index fee34afa1..000000000
Binary files a/doc/tutorials/introduction/desktop_java/images/eclipse_run.png and /dev/null differ
diff --git a/doc/tutorials/introduction/desktop_java/images/eclipse_user_lib.png b/doc/tutorials/introduction/desktop_java/images/eclipse_user_lib.png
deleted file mode 100644
index 11694526a..000000000
Binary files a/doc/tutorials/introduction/desktop_java/images/eclipse_user_lib.png and /dev/null differ
diff --git a/doc/tutorials/introduction/desktop_java/images/eclipse_user_lib2.png b/doc/tutorials/introduction/desktop_java/images/eclipse_user_lib2.png
deleted file mode 100644
index 2b9ec5c3c..000000000
Binary files a/doc/tutorials/introduction/desktop_java/images/eclipse_user_lib2.png and /dev/null differ
diff --git a/doc/tutorials/introduction/desktop_java/images/eclipse_user_lib3.png b/doc/tutorials/introduction/desktop_java/images/eclipse_user_lib3.png
deleted file mode 100644
index 4bf83ee03..000000000
Binary files a/doc/tutorials/introduction/desktop_java/images/eclipse_user_lib3.png and /dev/null differ
diff --git a/doc/tutorials/introduction/desktop_java/images/eclipse_user_lib4.png b/doc/tutorials/introduction/desktop_java/images/eclipse_user_lib4.png
deleted file mode 100644
index c3f353155..000000000
Binary files a/doc/tutorials/introduction/desktop_java/images/eclipse_user_lib4.png and /dev/null differ
diff --git a/doc/tutorials/introduction/desktop_java/images/eclipse_user_lib5.png b/doc/tutorials/introduction/desktop_java/images/eclipse_user_lib5.png
deleted file mode 100644
index ed79d92d4..000000000
Binary files a/doc/tutorials/introduction/desktop_java/images/eclipse_user_lib5.png and /dev/null differ
diff --git a/doc/tutorials/introduction/desktop_java/images/eclipse_user_lib6.png b/doc/tutorials/introduction/desktop_java/images/eclipse_user_lib6.png
deleted file mode 100644
index 3a98e38b1..000000000
Binary files a/doc/tutorials/introduction/desktop_java/images/eclipse_user_lib6.png and /dev/null differ
diff --git a/doc/tutorials/introduction/desktop_java/images/eclipse_user_lib7.png b/doc/tutorials/introduction/desktop_java/images/eclipse_user_lib7.png
deleted file mode 100644
index 019432016..000000000
Binary files a/doc/tutorials/introduction/desktop_java/images/eclipse_user_lib7.png and /dev/null differ
diff --git a/doc/tutorials/introduction/desktop_java/images/eclipse_user_lib8.png b/doc/tutorials/introduction/desktop_java/images/eclipse_user_lib8.png
deleted file mode 100644
index 5650aa79a..000000000
Binary files a/doc/tutorials/introduction/desktop_java/images/eclipse_user_lib8.png and /dev/null differ
diff --git a/doc/tutorials/introduction/desktop_java/java_dev_intro.rst b/doc/tutorials/introduction/desktop_java/java_dev_intro.rst
index 1b20bec25..d5cb31f89 100644
--- a/doc/tutorials/introduction/desktop_java/java_dev_intro.rst
+++ b/doc/tutorials/introduction/desktop_java/java_dev_intro.rst
@@ -7,10 +7,9 @@ Introduction to Java Development
 
 As of OpenCV 2.4.4, OpenCV supports desktop Java development using nearly the same interface as for
 Android development. This guide will help you to create your first Java (or Scala) application using OpenCV.
-We will use either `Eclipse <http://eclipse.org/>`_, `Apache Ant <http://ant.apache.org/>`_ or the
-`Simple Build Tool (SBT) <http://www.scala-sbt.org/>`_ to build the application.
+We will use either `Apache Ant <http://ant.apache.org/>`_ or `Simple Build Tool (SBT) <http://www.scala-sbt.org/>`_ to build the application.
 
-For further reading after this guide, look at the :ref:`Android_Dev_Intro` tutorials.
+If you want to use Eclipse, head to :ref:`Java_Eclipse`. For further reading after this guide, look at the :ref:`Android_Dev_Intro` tutorials.
 
 What we'll do in this guide
 ===========================
@@ -19,7 +18,7 @@ In this guide, we will:
 
 * Get OpenCV with desktop Java support
 
-* Create an ``Ant``, ``Eclipse`` or ``SBT`` project
+* Create an ``Ant`` or ``SBT`` project
 
 * Write a simple OpenCV application in Java or Scala
 
@@ -233,97 +232,6 @@ Java sample with Ant
         :alt: run app with Ant
         :align: center
 
-Java project in Eclipse
-=======================
-
-Now let's look at the possiblity of using OpenCV in Java when developing in Eclipse IDE.
-
-* Create a new Eclipse workspace
-* Create a new Java project via :guilabel:`File --> New --> Java Project`
-
-  .. image:: images/eclipse_new_java_prj.png
-     :alt: Eclipse: new Java project
-     :align: center
-
-  Call it say "HelloCV".
-
-* Open :guilabel:`Java Build Path` tab on :guilabel:`Project Properties` dialog
-  and configure additional library (OpenCV) reference (jar and native library location):
-
-  .. image:: images/eclipse_user_lib.png
-     :alt: Eclipse: external JAR
-     :align: center
-
-  |
-
-  .. image:: images/eclipse_user_lib2.png
-     :alt: Eclipse: external JAR
-     :align: center
-
-  |
-
-  .. image:: images/eclipse_user_lib3.png
-     :alt: Eclipse: external JAR
-     :align: center
-
-  |
-
-  .. image:: images/eclipse_user_lib4.png
-     :alt: Eclipse: external JAR
-     :align: center
-
-  |
-
-  .. image:: images/eclipse_user_lib5.png
-     :alt: Eclipse: external JAR
-     :align: center
-
-  |
-
-  .. image:: images/eclipse_user_lib6.png
-     :alt: Eclipse: external JAR
-     :align: center
-
-  |
-
-  .. image:: images/eclipse_user_lib7.png
-     :alt: Eclipse: external JAR
-     :align: center
-
-  |
-
-  .. image:: images/eclipse_user_lib8.png
-     :alt: Eclipse: external JAR
-     :align: center
-
-
-* Add a new Java class (say ``Main``) containing the application entry:
-
-  .. image:: images/eclipse_main_class.png
-     :alt: Eclipse: Main class
-     :align: center
-
-* Put some simple OpenCV calls there, e.g.:
-
-  .. code-block:: java
-
-    import org.opencv.core.Core;
-    import org.opencv.core.CvType;
-    import org.opencv.core.Mat;
-
-    public class Main {
-        public static void main(String[] args) {
-            System.loadLibrary(Core.NATIVE_LIBRARY_NAME);
-            Mat m  = Mat.eye(3, 3, CvType.CV_8UC1);
-            System.out.println("m = " + m.dump());
-        }
-    }
-
-* Press :guilabel:`Run` button and find the identity matrix content in the Eclipse ``Console`` window.
-
-  .. image:: images/eclipse_run.png
-     :alt: Eclipse: run
-     :align: center
 
 SBT project for Java and Scala
 ==============================
diff --git a/doc/tutorials/introduction/java_eclipse/images/1-window-preferences.png b/doc/tutorials/introduction/java_eclipse/images/1-window-preferences.png
new file mode 100644
index 000000000..53b3631fd
Binary files /dev/null and b/doc/tutorials/introduction/java_eclipse/images/1-window-preferences.png differ
diff --git a/doc/tutorials/introduction/java_eclipse/images/10-new-project-created.png b/doc/tutorials/introduction/java_eclipse/images/10-new-project-created.png
new file mode 100644
index 000000000..63ef65f14
Binary files /dev/null and b/doc/tutorials/introduction/java_eclipse/images/10-new-project-created.png differ
diff --git a/doc/tutorials/introduction/java_eclipse/images/11-the-code.png b/doc/tutorials/introduction/java_eclipse/images/11-the-code.png
new file mode 100644
index 000000000..917e76b8e
Binary files /dev/null and b/doc/tutorials/introduction/java_eclipse/images/11-the-code.png differ
diff --git a/doc/tutorials/introduction/java_eclipse/images/2-user-library-new.png b/doc/tutorials/introduction/java_eclipse/images/2-user-library-new.png
new file mode 100644
index 000000000..e65faef2d
Binary files /dev/null and b/doc/tutorials/introduction/java_eclipse/images/2-user-library-new.png differ
diff --git a/doc/tutorials/introduction/java_eclipse/images/3-library-name.png b/doc/tutorials/introduction/java_eclipse/images/3-library-name.png
new file mode 100644
index 000000000..bc79cde80
Binary files /dev/null and b/doc/tutorials/introduction/java_eclipse/images/3-library-name.png differ
diff --git a/doc/tutorials/introduction/java_eclipse/images/4-add-external-jars.png b/doc/tutorials/introduction/java_eclipse/images/4-add-external-jars.png
new file mode 100644
index 000000000..14dfa6247
Binary files /dev/null and b/doc/tutorials/introduction/java_eclipse/images/4-add-external-jars.png differ
diff --git a/doc/tutorials/introduction/java_eclipse/images/5-native-library.png b/doc/tutorials/introduction/java_eclipse/images/5-native-library.png
new file mode 100644
index 000000000..a0a4962e9
Binary files /dev/null and b/doc/tutorials/introduction/java_eclipse/images/5-native-library.png differ
diff --git a/doc/tutorials/introduction/java_eclipse/images/6-external-folder.png b/doc/tutorials/introduction/java_eclipse/images/6-external-folder.png
new file mode 100644
index 000000000..8ef7b32a3
Binary files /dev/null and b/doc/tutorials/introduction/java_eclipse/images/6-external-folder.png differ
diff --git a/doc/tutorials/introduction/java_eclipse/images/7-user-library-final.png b/doc/tutorials/introduction/java_eclipse/images/7-user-library-final.png
new file mode 100644
index 000000000..67b15daba
Binary files /dev/null and b/doc/tutorials/introduction/java_eclipse/images/7-user-library-final.png differ
diff --git a/doc/tutorials/introduction/java_eclipse/images/7_5-new-java-project.png b/doc/tutorials/introduction/java_eclipse/images/7_5-new-java-project.png
new file mode 100644
index 000000000..8aaef5d4e
Binary files /dev/null and b/doc/tutorials/introduction/java_eclipse/images/7_5-new-java-project.png differ
diff --git a/doc/tutorials/introduction/java_eclipse/images/8-add-library.png b/doc/tutorials/introduction/java_eclipse/images/8-add-library.png
new file mode 100644
index 000000000..b13c65c7a
Binary files /dev/null and b/doc/tutorials/introduction/java_eclipse/images/8-add-library.png differ
diff --git a/doc/tutorials/introduction/java_eclipse/images/9-select-user-lib.png b/doc/tutorials/introduction/java_eclipse/images/9-select-user-lib.png
new file mode 100644
index 000000000..1c80e9821
Binary files /dev/null and b/doc/tutorials/introduction/java_eclipse/images/9-select-user-lib.png differ
diff --git a/doc/tutorials/introduction/java_eclipse/java_eclipse.rst b/doc/tutorials/introduction/java_eclipse/java_eclipse.rst
new file mode 100644
index 000000000..bc2247da9
--- /dev/null
+++ b/doc/tutorials/introduction/java_eclipse/java_eclipse.rst
@@ -0,0 +1,110 @@
+
+.. _Java_Eclipse:
+
+
+Using OpenCV Java with Eclipse
+*********************************************
+
+Since version 2.4.4 `OpenCV supports Java <http://opencv.org/opencv-java-api.html>`_. In this tutorial I will explain how to set up a development environment for using OpenCV Java with Eclipse in **Windows**, so you can enjoy the benefits of a garbage-collected, very refactorable (rename variable, extract method and whatnot) modern language that enables you to write code with less effort and make fewer mistakes. Here we go.
+
+
+Configuring Eclipse
+===================
+
+First, obtain a fresh release of OpenCV `from the download page <http://opencv.org/downloads.html>`_ and extract it under a simple location like ``C:\OpenCV-2.4.6\``. I am using version 2.4.6, but the steps are more or less the same for other versions.
+
+Now, we will define OpenCV as a user library in Eclipse, so we can reuse the configuration for any project. Launch Eclipse and select :guilabel:`Window --> Preferences` from the menu.
+
+.. image:: images/1-window-preferences.png
+     :alt: Eclipse preferences
+     :align: center
+
+Navigate under :guilabel:`Java --> Build Path --> User Libraries` and click :guilabel:`New...`.
+
+.. image:: images/2-user-library-new.png
+     :alt: Creating a new library
+     :align: center
+
+Enter a name, e.g. ``OpenCV-2.4.6``, for your new library.
+
+.. image:: images/3-library-name.png
+     :alt: Naming the new library
+     :align: center
+
+Now select your new user library and click :guilabel:`Add External JARs...`.
+
+.. image:: images/4-add-external-jars.png
+     :alt: Adding external jar
+     :align: center
+
+Browse to ``C:\OpenCV-2.4.6\build\java\`` and select ``opencv-246.jar``. After adding the jar, expand the :guilabel:`opencv-246.jar` entry, select :guilabel:`Native library location` and press :guilabel:`Edit...`.
+
+.. image:: images/5-native-library.png
+     :alt: Selecting native library location 1
+     :align: center
+
+Select :guilabel:`External Folder...` and browse to select the folder ``C:\OpenCV-2.4.6\build\java\x64``. If you have a 32-bit system you need to select the ``x86`` folder instead of ``x64``.
+
+.. image:: images/6-external-folder.png
+     :alt: Selecting native library location 2
+     :align: center
+
+Your user library configuration should look like this:
+
+.. image:: images/7-user-library-final.png
+     :alt: Final user library configuration
+     :align: center
+
+
+Testing the configuration on a new Java project
+=====================================================
+
+Now start creating a new Java project.
+
+.. image:: images/7_5-new-java-project.png
+     :alt: Creating new Java project
+     :align: center
+
+On the :guilabel:`Java Settings` step, under :guilabel:`Libraries` tab, select :guilabel:`Add Library...` and select :guilabel:`OpenCV-2.4.6`, then click :guilabel:`Finish`.
+
+.. image:: images/8-add-library.png
+     :alt: Adding user defined library 1
+     :align: center
+
+.. image:: images/9-select-user-lib.png
+     :alt: Adding user defined library 2
+     :align: center
+
+
+Libraries should look like this:
+
+.. image:: images/10-new-project-created.png
+     :alt: Libraries of the newly created project
+     :align: center
+
+
+Now that you have created and configured a new Java project, it is time to test it. Create a new Java file. Here is some starter code for your convenience:
+
+.. code-block:: java
+
+   import org.opencv.core.Core;
+   import org.opencv.core.CvType;
+   import org.opencv.core.Mat;
+
+   public class Hello
+   {
+      public static void main( String[] args )
+      {
+         System.loadLibrary( Core.NATIVE_LIBRARY_NAME );
+         Mat mat = Mat.eye( 3, 3, CvType.CV_8UC1 );
+         System.out.println( "mat = " + mat.dump() );
+      }
+   }
+
+When you run the code you should see a 3x3 identity matrix as output.
+
+.. image:: images/11-the-code.png
+     :alt: The sample code and its output
+     :align: center
+
+That is it! Whenever you start a new project, just add the OpenCV user library that you have defined to your project and you are good to go. Enjoy your powerful, less painful development environment :)
\ No newline at end of file
diff --git a/doc/tutorials/introduction/table_of_content_introduction/images/eclipse-logo.png b/doc/tutorials/introduction/table_of_content_introduction/images/eclipse-logo.png
new file mode 100644
index 000000000..64ec01c25
Binary files /dev/null and b/doc/tutorials/introduction/table_of_content_introduction/images/eclipse-logo.png differ
diff --git a/doc/tutorials/introduction/table_of_content_introduction/table_of_content_introduction.rst b/doc/tutorials/introduction/table_of_content_introduction/table_of_content_introduction.rst
index 2238b8787..ee8dd67ef 100644
--- a/doc/tutorials/introduction/table_of_content_introduction/table_of_content_introduction.rst
+++ b/doc/tutorials/introduction/table_of_content_introduction/table_of_content_introduction.rst
@@ -138,6 +138,24 @@ world of the OpenCV.
                         :height: 90pt
                         :width:  90pt
 
+  .. tabularcolumns:: m{100pt} m{300pt}
+  .. cssclass:: toctableopencv
+
+  ================ =================================================
+  |EclipseLogo|    **Title:** :ref:`Java_Eclipse`
+
+                   *Compatibility:* > OpenCV 2.4.4
+
+                   *Author:* |Author_BarisD|
+
+                   A tutorial on how to use OpenCV Java with Eclipse.
+
+  ================ =================================================
+
+     .. |EclipseLogo| image:: images/eclipse-logo.png
+                        :height: 90pt
+                        :width:  90pt
+
 * **Android**
 
   .. tabularcolumns:: m{100pt} m{300pt}
@@ -295,6 +313,7 @@ world of the OpenCV.
    ../windows_visual_studio_Opencv/windows_visual_studio_Opencv
    ../windows_visual_studio_image_watch/windows_visual_studio_image_watch
    ../desktop_java/java_dev_intro
+   ../java_eclipse/java_eclipse
    ../android_binary_package/android_dev_intro
    ../android_binary_package/O4A_SDK
    ../android_binary_package/dev_with_OCV_on_Android
diff --git a/modules/bioinspired/doc/retina/index.rst b/modules/bioinspired/doc/retina/index.rst
index 242416baa..fd487b7f9 100644
--- a/modules/bioinspired/doc/retina/index.rst
+++ b/modules/bioinspired/doc/retina/index.rst
@@ -110,8 +110,8 @@ Here is an overview of the abstract Retina interface, allocate one instance with
 
 .. Sample code::
 
-   * An example on retina tone mapping can be found at opencv_source_code/samples/cpp/OpenEXRimages_HighDynamicRange_Retina_toneMapping.cpp
-   * An example on retina tone mapping on video input can be found at opencv_source_code/samples/cpp/OpenEXRimages_HighDynamicRange_Retina_toneMapping.cpp
+   * An example on retina tone mapping can be found at opencv_source_code/samples/cpp/OpenEXRimages_HDR_Retina_toneMapping.cpp
+   * An example on retina tone mapping on video input can be found at opencv_source_code/samples/cpp/OpenEXRimages_HDR_Retina_toneMapping.cpp
    * A complete example illustrating the retina interface can be found at opencv_source_code/samples/cpp/retinaDemo.cpp
 
 Description
@@ -182,13 +182,13 @@ Take a look at the provided C++ examples provided with OpenCV :
    **Note :** This demo generates the file *RetinaDefaultParameters.xml* which contains the default parameters of the retina. Then, rename this as *RetinaSpecificParameters.xml*, adjust the parameters the way you want and reload the program to check the effect.
 
 
-* **samples/cpp/OpenEXRimages_HighDynamicRange_Retina_toneMapping.cpp** shows how to use the retina to perform High Dynamic Range (HDR) luminance compression
+* **samples/cpp/OpenEXRimages_HDR_Retina_toneMapping.cpp** shows how to use the retina to perform High Dynamic Range (HDR) luminance compression
 
    Then, take a HDR image using bracketing with your camera and generate an OpenEXR image and then process it using the demo.
 
    Typical use, supposing that you have the OpenEXR image such as *memorial.exr* (present in the samples/cpp/ folder)
 
-   **OpenCVReleaseFolder/bin/OpenEXRimages_HighDynamicRange_Retina_toneMapping memorial.exr [optionnal: 'fast']**
+   **OpenCVReleaseFolder/bin/OpenEXRimages_HDR_Retina_toneMapping memorial.exr [optional: 'fast']**
 
       Note that some sliders are made available to allow you to play with luminance compression.
 
diff --git a/modules/core/include/opencv2/core/core_c.h b/modules/core/include/opencv2/core/core_c.h
index ca8413ee2..e21f90eec 100644
--- a/modules/core/include/opencv2/core/core_c.h
+++ b/modules/core/include/opencv2/core/core_c.h
@@ -1144,7 +1144,7 @@ CVAPI(void)   cvSetRemove( CvSet* set_header, int index );
    NULL is returned */
 CV_INLINE CvSetElem* cvGetSetElem( const CvSet* set_header, int idx )
 {
-    CvSetElem* elem = (CvSetElem*)cvGetSeqElem( (CvSeq*)set_header, idx );
+    CvSetElem* elem = (CvSetElem*)(void *)cvGetSeqElem( (CvSeq*)set_header, idx );
     return elem && CV_IS_SET_ELEM( elem ) ? elem : 0;
 }
 
diff --git a/modules/core/include/opencv2/core/mat.inl.hpp b/modules/core/include/opencv2/core/mat.inl.hpp
index 026ab695d..671ac60a9 100644
--- a/modules/core/include/opencv2/core/mat.inl.hpp
+++ b/modules/core/include/opencv2/core/mat.inl.hpp
@@ -1734,13 +1734,13 @@ const _Tp& SparseMat::value(const Node* n) const
 inline
 SparseMat::Node* SparseMat::node(size_t nidx)
 {
-    return (Node*)&hdr->pool[nidx];
+    return (Node*)(void*)&hdr->pool[nidx];
 }
 
 inline
 const SparseMat::Node* SparseMat::node(size_t nidx) const
 {
-    return (const Node*)&hdr->pool[nidx];
+    return (const Node*)(const void*)&hdr->pool[nidx];
 }
 
 inline
@@ -2488,7 +2488,7 @@ const _Tp& SparseMatConstIterator::value() const
 inline
 const SparseMat::Node* SparseMatConstIterator::node() const
 {
-    return (ptr && m && m->hdr) ? (const SparseMat::Node*)(ptr - m->hdr->valueOffset) : 0;
+    return (ptr && m && m->hdr) ? (const SparseMat::Node*)(const void*)(ptr - m->hdr->valueOffset) : 0;
 }
 
 inline
diff --git a/modules/core/include/opencv2/core/persistence.hpp b/modules/core/include/opencv2/core/persistence.hpp
index f5687614b..8f515c54d 100644
--- a/modules/core/include/opencv2/core/persistence.hpp
+++ b/modules/core/include/opencv2/core/persistence.hpp
@@ -861,8 +861,8 @@ inline FileNode::operator String() const { String value; read(*this, value, valu
 inline FileNodeIterator FileNode::begin() const { return FileNodeIterator(fs, node); }
 inline FileNodeIterator FileNode::end() const   { return FileNodeIterator(fs, node, size()); }
 inline void FileNode::readRaw( const String& fmt, uchar* vec, size_t len ) const { begin().readRaw( fmt, vec, len ); }
-inline FileNode FileNodeIterator::operator *() const  { return FileNode(fs, (const CvFileNode*)reader.ptr); }
-inline FileNode FileNodeIterator::operator ->() const { return FileNode(fs, (const CvFileNode*)reader.ptr); }
+inline FileNode FileNodeIterator::operator *() const  { return FileNode(fs, (const CvFileNode*)(const void*)reader.ptr); }
+inline FileNode FileNodeIterator::operator ->() const { return FileNode(fs, (const CvFileNode*)(const void*)reader.ptr); }
 inline String::String(const FileNode& fn): cstr_(0), len_(0) { read(fn, *this, *this); }
 
 } // cv
diff --git a/modules/core/include/opencv2/core/types_c.h b/modules/core/include/opencv2/core/types_c.h
index e4c8cb5a6..4555efe6b 100644
--- a/modules/core/include/opencv2/core/types_c.h
+++ b/modules/core/include/opencv2/core/types_c.h
@@ -523,11 +523,11 @@ CV_INLINE  double  cvmGet( const CvMat* mat, int row, int col )
             (unsigned)col < (unsigned)mat->cols );
 
     if( type == CV_32FC1 )
-        return ((float*)(mat->data.ptr + (size_t)mat->step*row))[col];
+        return ((float*)(void*)(mat->data.ptr + (size_t)mat->step*row))[col];
     else
     {
         assert( type == CV_64FC1 );
-        return ((double*)(mat->data.ptr + (size_t)mat->step*row))[col];
+        return ((double*)(void*)(mat->data.ptr + (size_t)mat->step*row))[col];
     }
 }
 
@@ -540,11 +540,11 @@ CV_INLINE  void  cvmSet( CvMat* mat, int row, int col, double value )
             (unsigned)col < (unsigned)mat->cols );
 
     if( type == CV_32FC1 )
-        ((float*)(mat->data.ptr + (size_t)mat->step*row))[col] = (float)value;
+        ((float*)(void*)(mat->data.ptr + (size_t)mat->step*row))[col] = (float)value;
     else
     {
         assert( type == CV_64FC1 );
-        ((double*)(mat->data.ptr + (size_t)mat->step*row))[col] = (double)value;
+        ((double*)(void*)(mat->data.ptr + (size_t)mat->step*row))[col] = (double)value;
     }
 }
 
diff --git a/modules/core/include/opencv2/core/utility.hpp b/modules/core/include/opencv2/core/utility.hpp
index bafc1d072..dae38282a 100644
--- a/modules/core/include/opencv2/core/utility.hpp
+++ b/modules/core/include/opencv2/core/utility.hpp
@@ -238,6 +238,7 @@ template<typename _Tp> static inline _Tp* alignPtr(_Tp* ptr, int n=(int)sizeof(_
 */
 static inline size_t alignSize(size_t sz, int n)
 {
+    CV_DbgAssert((n & (n - 1)) == 0); // n is a power of 2
     return (sz + n-1) & -n;
 }
 
diff --git a/modules/core/src/dxt.cpp b/modules/core/src/dxt.cpp
index a802868df..e6fed4eae 100644
--- a/modules/core/src/dxt.cpp
+++ b/modules/core/src/dxt.cpp
@@ -50,7 +50,7 @@ namespace cv
 # pragma warning(disable: 4748)
 #endif
 
-#if defined HAVE_IPP && IPP_VERSION_MAJOR >= 7
+#if defined HAVE_IPP && IPP_VERSION_MAJOR*100 + IPP_VERSION_MINOR >= 701
 #define USE_IPP_DFT 1
 #else
 #undef USE_IPP_DFT
diff --git a/modules/core/src/stat.cpp b/modules/core/src/stat.cpp
index bf0788e6b..f24579ca2 100644
--- a/modules/core/src/stat.cpp
+++ b/modules/core/src/stat.cpp
@@ -1610,7 +1610,8 @@ double cv::norm( InputArray _src, int normType, InputArray _mask )
     int depth = src.depth(), cn = src.channels();
 
     normType &= 7;
-    CV_Assert( normType == NORM_INF || normType == NORM_L1 || normType == NORM_L2 || normType == NORM_L2SQR ||
+    CV_Assert( normType == NORM_INF || normType == NORM_L1 ||
+               normType == NORM_L2 || normType == NORM_L2SQR ||
                ((normType == NORM_HAMMING || normType == NORM_HAMMING2) && src.type() == CV_8U) );
 
 #if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7)
@@ -1981,7 +1982,8 @@ double cv::norm( InputArray _src1, InputArray _src2, int normType, InputArray _m
     CV_Assert( src1.size == src2.size && src1.type() == src2.type() );
 
     normType &= 7;
-    CV_Assert( normType == NORM_INF || normType == NORM_L1 || normType == NORM_L2 || normType == NORM_L2SQR ||
+    CV_Assert( normType == NORM_INF || normType == NORM_L1 ||
+               normType == NORM_L2 || normType == NORM_L2SQR ||
               ((normType == NORM_HAMMING || normType == NORM_HAMMING2) && src1.type() == CV_8U) );
 
 #if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7)
diff --git a/modules/flann/include/opencv2/flann/lsh_table.h b/modules/flann/include/opencv2/flann/lsh_table.h
index 9b3ac0991..2c99a3aee 100644
--- a/modules/flann/include/opencv2/flann/lsh_table.h
+++ b/modules/flann/include/opencv2/flann/lsh_table.h
@@ -384,7 +384,7 @@ inline size_t LshTable<unsigned char>::getKey(const unsigned char* feature) cons
 {
     // no need to check if T is dividable by sizeof(size_t) like in the Hamming
     // distance computation as we have a mask
-    const size_t* feature_block_ptr = reinterpret_cast<const size_t*> (feature);
+    const size_t* feature_block_ptr = reinterpret_cast<const size_t*> ((const void*)feature);
 
     // Figure out the subsignature of the feature
     // Given the feature ABCDEF, and the mask 001011, the output will be
diff --git a/modules/highgui/src/cap_dc1394.cpp b/modules/highgui/src/cap_dc1394.cpp
index 3e628acb7..9706bd12e 100644
--- a/modules/highgui/src/cap_dc1394.cpp
+++ b/modules/highgui/src/cap_dc1394.cpp
@@ -296,11 +296,7 @@ static CvCaptureCAM_DC1394 * icvCaptureFromCAM_DC1394 (int index)
     if (pcap->format!=FORMAT_SCALABLE_IMAGE_SIZE) { // everything except Format 7
         if (dc1394_dma_setup_capture(pcap->handle, pcap->camera->node, index+1 /*channel*/,
                     pcap->format, pcap->mode, SPEED_400,
-                    pcap->frame_rate, NUM_BUFFERS,
-#ifdef HAVE_DC1394_095
-                    0 /*do_extra_buffering*/,
-#endif
-                    1 /*DROP_FRAMES*/,
+                    pcap->frame_rate, NUM_BUFFERS, 1 /*drop_frames*/,
                     pcap->device_name, pcap->camera) != DC1394_SUCCESS) {
             fprintf(stderr,"%s:%d: Failed to setup DMA capture with VIDEO1394\n",__FILE__,__LINE__);
             goto ERROR;
@@ -311,11 +307,7 @@ static CvCaptureCAM_DC1394 * icvCaptureFromCAM_DC1394 (int index)
                     pcap->mode, SPEED_400, QUERY_FROM_CAMERA,
                     (unsigned int)QUERY_FROM_CAMERA, (unsigned int)QUERY_FROM_CAMERA,
                     (unsigned int)QUERY_FROM_CAMERA, (unsigned int)QUERY_FROM_CAMERA,
-                    NUM_BUFFERS,
-#ifdef HAVE_DC1394_095
-                    0 /*do_extra_buffering*/,
-#endif
-                    1 /*DROP_FRAMES*/,
+                    NUM_BUFFERS, 1 /*drop_frames*/,
                     pcap->device_name, pcap->camera) != DC1394_SUCCESS) {
             fprintf(stderr,"%s:%d: Failed to setup DMA capture with VIDEO1394\n",__FILE__,__LINE__);
             goto ERROR;
@@ -661,11 +653,7 @@ icvSetModeCAM_DC1394( CvCaptureCAM_DC1394 * capture, int mode ){
     dc1394_dma_unlisten(capture->handle, capture->camera);
     if (dc1394_dma_setup_capture(capture->handle, capture->camera->node, capture->camera->channel /*channel*/,
                 format, mode, SPEED_400,
-                frame_rate, NUM_BUFFERS,
-#ifdef HAVE_DC1394_095
-                0 /*do_extra_buffering*/,
-#endif
-                1 /*DROP_FRAMES*/,
+                frame_rate, NUM_BUFFERS, 1 /*drop_frames*/,
                 capture->device_name, capture->camera) != DC1394_SUCCESS) {
         fprintf(stderr,"%s:%d: Failed to setup DMA capture with VIDEO1394\n",__FILE__,__LINE__);
         return 0;
diff --git a/modules/highgui/src/cap_qtkit.mm b/modules/highgui/src/cap_qtkit.mm
index 8a7b3d84e..2bc82b974 100644
--- a/modules/highgui/src/cap_qtkit.mm
+++ b/modules/highgui/src/cap_qtkit.mm
@@ -287,11 +287,17 @@ bool CvCaptureCAM::grabFrame(double timeOut) {
     double sleepTime = 0.005;
     double total = 0;
 
-    NSDate *loopUntil = [NSDate dateWithTimeIntervalSinceNow:sleepTime];
-    while (![capture updateImage] && (total += sleepTime)<=timeOut &&
-           [[NSRunLoop currentRunLoop] runMode: NSDefaultRunLoopMode
-                                    beforeDate:loopUntil])
-        loopUntil = [NSDate dateWithTimeIntervalSinceNow:sleepTime];
+    // If the capture is launched in a separate thread, then
+    // [NSRunLoop currentRunLoop] is not the same as in the main thread, and has no timer.
+    //see https://developer.apple.com/library/mac/#documentation/Cocoa/Reference/Foundation/Classes/nsrunloop_Class/Reference/Reference.html
+    // "If no input sources or timers are attached to the run loop, this
+    // method exits immediately"
+    // using usleep() is not a good alternative, because it may block the GUI.
+    // Create a dummy timer so that runUntilDate does not exit immediately:
+    [NSTimer scheduledTimerWithTimeInterval:100 target:nil selector:@selector(doFireTimer:) userInfo:nil repeats:YES];
+    while (![capture updateImage] && (total += sleepTime)<=timeOut) {
+        [[NSRunLoop currentRunLoop] runUntilDate:[NSDate dateWithTimeIntervalSinceNow:sleepTime]];
+    }
 
     [localpool drain];
 
@@ -336,9 +342,11 @@ int CvCaptureCAM::startCaptureDevice(int cameraNum) {
     }
 
     if (cameraNum >= 0) {
-        int nCameras = [devices count];
-        if( cameraNum < 0 || cameraNum >= nCameras )
+        NSUInteger nCameras = [devices count];
+        if( (NSUInteger)cameraNum >= nCameras ) {
+            [localpool drain];
             return 0;
+        }
         device = [devices objectAtIndex:cameraNum] ;
     } else {
         device = [QTCaptureDevice defaultInputDeviceWithMediaType:QTMediaTypeVideo]  ;
@@ -402,6 +410,7 @@ int CvCaptureCAM::startCaptureDevice(int cameraNum) {
 
         grabFrame(60);
 
+        [localpool drain];
         return 1;
     }
 
@@ -431,6 +440,7 @@ void CvCaptureCAM::setWidthHeight() {
 
 
 double CvCaptureCAM::getProperty(int property_id){
+    int retval;
     NSAutoreleasePool* localpool = [[NSAutoreleasePool alloc] init];
 
     NSArray* connections = [mCaptureDeviceInput	connections];
@@ -440,15 +450,18 @@ double CvCaptureCAM::getProperty(int property_id){
     int width=s1.width, height=s1.height;
     switch (property_id) {
         case CV_CAP_PROP_FRAME_WIDTH:
-            return width;
+            retval = width;
+            break;
         case CV_CAP_PROP_FRAME_HEIGHT:
-            return height;
+            retval = height;
+            break;
         default:
-            return 0;
+            retval = 0;
+            break;
     }
 
     [localpool drain];
-
+    return retval;
 }
 
 bool CvCaptureCAM::setProperty(int property_id, double value) {
@@ -496,13 +509,15 @@ bool CvCaptureCAM::setProperty(int property_id, double value) {
 @implementation CaptureDelegate
 
 - (id)init {
-    [super init];
-    newFrame = 0;
-    imagedata = NULL;
-    bgr_imagedata = NULL;
-    currSize = 0;
-    image = NULL;
-    bgr_image = NULL;
+    self = [super init];
+    if (self) {
+        newFrame = 0;
+        imagedata = NULL;
+        bgr_imagedata = NULL;
+        currSize = 0;
+        image = NULL;
+        bgr_image = NULL;
+    }
     return self;
 }
 
@@ -577,26 +592,26 @@ didDropVideoFrameWithSampleBuffer:(QTSampleBuffer *)sampleBuffer
         memcpy(imagedata, baseaddress, currSize);
 
         if (image == NULL) {
-            image = cvCreateImageHeader(cvSize(width,height), IPL_DEPTH_8U, 4);
+            image = cvCreateImageHeader(cvSize((int)width,(int)height), IPL_DEPTH_8U, 4);
         }
-        image->width =width;
-        image->height = height;
+        image->width = (int)width;
+        image->height = (int)height;
         image->nChannels = 4;
         image->depth = IPL_DEPTH_8U;
-        image->widthStep = rowBytes;
+        image->widthStep = (int)rowBytes;
         image->imageData = imagedata;
-        image->imageSize = currSize;
+        image->imageSize = (int)currSize;
 
         if (bgr_image == NULL) {
-            bgr_image = cvCreateImageHeader(cvSize(width,height), IPL_DEPTH_8U, 3);
+            bgr_image = cvCreateImageHeader(cvSize((int)width,(int)height), IPL_DEPTH_8U, 3);
         }
-        bgr_image->width =width;
-        bgr_image->height = height;
+        bgr_image->width = (int)width;
+        bgr_image->height = (int)height;
         bgr_image->nChannels = 3;
         bgr_image->depth = IPL_DEPTH_8U;
-        bgr_image->widthStep = rowBytes;
+        bgr_image->widthStep = (int)rowBytes;
         bgr_image->imageData = bgr_imagedata;
-        bgr_image->imageSize = currSize;
+        bgr_image->imageSize = (int)currSize;
 
         cvCvtColor(image, bgr_image, CV_BGRA2BGR);
 
@@ -750,29 +765,29 @@ IplImage* CvCaptureFile::retrieveFramePixelBuffer() {
         }
 
         if (image == NULL) {
-            image = cvCreateImageHeader(cvSize(width,height), IPL_DEPTH_8U, 4);
+            image = cvCreateImageHeader(cvSize((int)width,(int)height), IPL_DEPTH_8U, 4);
         }
 
-        image->width =width;
-        image->height = height;
+        image->width = (int)width;
+        image->height = (int)height;
         image->nChannels = 4;
         image->depth = IPL_DEPTH_8U;
-        image->widthStep = rowBytes;
+        image->widthStep = (int)rowBytes;
         image->imageData = imagedata;
-        image->imageSize = currSize;
+        image->imageSize = (int)currSize;
 
 
         if (bgr_image == NULL) {
-            bgr_image = cvCreateImageHeader(cvSize(width,height), IPL_DEPTH_8U, 3);
+            bgr_image = cvCreateImageHeader(cvSize((int)width,(int)height), IPL_DEPTH_8U, 3);
         }
 
-        bgr_image->width =width;
-        bgr_image->height = height;
+        bgr_image->width = (int)width;
+        bgr_image->height = (int)height;
         bgr_image->nChannels = 3;
         bgr_image->depth = IPL_DEPTH_8U;
-        bgr_image->widthStep = rowBytes;
+        bgr_image->widthStep = (int)rowBytes;
         bgr_image->imageData = bgr_imagedata;
-        bgr_image->imageSize = currSize;
+        bgr_image->imageSize = (int)currSize;
 
         cvCvtColor(image, bgr_image,CV_BGRA2BGR);
 
diff --git a/modules/imgproc/src/filter.cpp b/modules/imgproc/src/filter.cpp
index f7e7efa85..fbf1de247 100644
--- a/modules/imgproc/src/filter.cpp
+++ b/modules/imgproc/src/filter.cpp
@@ -46,6 +46,12 @@
                                     Base Image Filter
 \****************************************************************************************/
 
+#if defined HAVE_IPP && IPP_VERSION_MAJOR*100 + IPP_VERSION_MINOR >= 701
+#define USE_IPP_SEP_FILTERS 1
+#else
+#undef USE_IPP_SEP_FILTERS
+#endif
+
 namespace cv
 {
 
@@ -1401,21 +1407,53 @@ struct RowVec_32f
     RowVec_32f( const Mat& _kernel )
     {
         kernel = _kernel;
+        haveSSE = checkHardwareSupport(CV_CPU_SSE);
+#ifdef USE_IPP_SEP_FILTERS
+        bufsz = -1;
+#endif
     }
 
     int operator()(const uchar* _src, uchar* _dst, int width, int cn) const
     {
-        if( !checkHardwareSupport(CV_CPU_SSE) )
-            return 0;
-
-        int i = 0, k, _ksize = kernel.rows + kernel.cols - 1;
+        int _ksize = kernel.rows + kernel.cols - 1;
+        const float* src0 = (const float*)_src;
         float* dst = (float*)_dst;
         const float* _kx = (const float*)kernel.data;
+
+#ifdef USE_IPP_SEP_FILTERS
+        IppiSize roisz = { width, 1 };
+        if( (cn == 1 || cn == 3) && width >= _ksize*8 )
+        {
+            if( bufsz < 0 )
+            {
+                if( (cn == 1 && ippiFilterRowBorderPipelineGetBufferSize_32f_C1R(roisz, _ksize, &bufsz) < 0) ||
+                    (cn == 3 && ippiFilterRowBorderPipelineGetBufferSize_32f_C3R(roisz, _ksize, &bufsz) < 0))
+                    return 0;
+            }
+            AutoBuffer<uchar> buf(bufsz + 64);
+            uchar* bufptr = alignPtr((uchar*)buf, 32);
+            int step = (int)(width*sizeof(dst[0])*cn);
+            float borderValue[] = {0.f, 0.f, 0.f};
+            // here is the trick. IPP needs border type and extrapolates the row. We did it already.
+            // So we pass anchor=0 and ignore the right tail of results since they are incorrect there.
+            if( (cn == 1 && ippiFilterRowBorderPipeline_32f_C1R(src0, step, &dst, roisz, _kx, _ksize, 0,
+                                                                ippBorderRepl, borderValue[0], bufptr) < 0) ||
+                (cn == 3 && ippiFilterRowBorderPipeline_32f_C3R(src0, step, &dst, roisz, _kx, _ksize, 0,
+                                                                ippBorderRepl, borderValue, bufptr) < 0))
+                return 0;
+            return width - _ksize + 1;
+        }
+#endif
+
+        if( !haveSSE )
+            return 0;
+
+        int i = 0, k;
         width *= cn;
 
         for( ; i <= width - 8; i += 8 )
         {
-            const float* src = (const float*)_src + i;
+            const float* src = src0 + i;
             __m128 f, s0 = _mm_setzero_ps(), s1 = s0, x0, x1;
             for( k = 0; k < _ksize; k++, src += cn )
             {
@@ -1434,6 +1472,10 @@ struct RowVec_32f
     }
 
     Mat kernel;
+    bool haveSSE;
+#ifdef USE_IPP_SEP_FILTERS
+    mutable int bufsz;
+#endif
 };
 
 
diff --git a/modules/imgproc/src/imgwarp.cpp b/modules/imgproc/src/imgwarp.cpp
index 9e2048407..7ce02c38e 100644
--- a/modules/imgproc/src/imgwarp.cpp
+++ b/modules/imgproc/src/imgwarp.cpp
@@ -1880,12 +1880,10 @@ public:
           IppiRect dstroi = { 0, dsty, dstwidth, dstheight - dsty };
           int bufsize;
           ippiResizeGetBufSize( srcroi, dstroi, cn, mode, &bufsize );
-          Ipp8u *buf;
-          buf = ippsMalloc_8u( bufsize );
-          IppStatus sts;
-          if( func( src.data, ippiSize(src.cols, src.rows), (int)src.step[0], srcroi, dst.data, (int)dst.step[0], dstroi, inv_scale_x, inv_scale_y, 0, 0, mode, buf ) < 0 )
+          AutoBuffer<uchar> buf(bufsize + 64);
+          uchar* bufptr = alignPtr((uchar*)buf, 32);
+          if( func( src.data, ippiSize(src.cols, src.rows), (int)src.step[0], srcroi, dst.data, (int)dst.step[0], dstroi, inv_scale_x, inv_scale_y, 0, 0, mode, bufptr ) < 0 )
               *ok = false;
-          ippsFree(buf);
       }
 private:
     Mat &src;
diff --git a/modules/java/CMakeLists.txt b/modules/java/CMakeLists.txt
index ad10aafc5..3dfae1238 100644
--- a/modules/java/CMakeLists.txt
+++ b/modules/java/CMakeLists.txt
@@ -193,7 +193,7 @@ if(ANDROID AND ANDROID_EXECUTABLE)
   set(lib_target_files ${ANDROID_LIB_PROJECT_FILES})
   ocv_list_add_prefix(lib_target_files "${OpenCV_BINARY_DIR}/")
 
-  android_get_compatible_target(lib_target_sdk_target ${ANDROID_NATIVE_API_LEVEL} ${ANDROID_SDK_TARGET} 11)
+  android_get_compatible_target(lib_target_sdk_target ${ANDROID_NATIVE_API_LEVEL} ${ANDROID_SDK_TARGET} 14)
   configure_file("${CMAKE_CURRENT_SOURCE_DIR}/android_lib/${ANDROID_MANIFEST_FILE}" "${CMAKE_CURRENT_BINARY_DIR}/${ANDROID_MANIFEST_FILE}" @ONLY)
 
   add_custom_command(OUTPUT ${lib_target_files} "${OpenCV_BINARY_DIR}/${ANDROID_MANIFEST_FILE}"
diff --git a/modules/java/generator/src/java/android+JavaCameraView.java b/modules/java/generator/src/java/android+JavaCameraView.java
index f864e5370..0acd85c19 100644
--- a/modules/java/generator/src/java/android+JavaCameraView.java
+++ b/modules/java/generator/src/java/android+JavaCameraView.java
@@ -146,6 +146,9 @@ public class JavaCameraView extends CameraBridgeViewBase implements PreviewCallb
                     Log.d(TAG, "Set preview size to " + Integer.valueOf((int)frameSize.width) + "x" + Integer.valueOf((int)frameSize.height));
                     params.setPreviewSize((int)frameSize.width, (int)frameSize.height);
 
+                    if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.ICE_CREAM_SANDWICH)
+                        params.setRecordingHint(true);
+
                     List<String> FocusModes = params.getSupportedFocusModes();
                     if (FocusModes != null && FocusModes.contains(Camera.Parameters.FOCUS_MODE_CONTINUOUS_VIDEO))
                     {
diff --git a/modules/legacy/src/dpstereo.cpp b/modules/legacy/src/dpstereo.cpp
index 838cf8ae0..7b7ebd6d3 100644
--- a/modules/legacy/src/dpstereo.cpp
+++ b/modules/legacy/src/dpstereo.cpp
@@ -76,8 +76,8 @@ typedef struct _CvRightImData
     uchar min_val, max_val;
 } _CvRightImData;
 
-#define CV_IMAX3(a,b,c) ((temp2 = (a) >= (b) ? (a) : (b)),(temp2 >= (c) ? temp2 : (c)))
-#define CV_IMIN3(a,b,c) ((temp3 = (a) <= (b) ? (a) : (b)),(temp3 <= (c) ? temp3 : (c)))
+#define CV_IMAX3(a,b,c) (std::max(std::max((a), (b)), (c)))
+#define CV_IMIN3(a,b,c) (std::min(std::min((a), (b)), (c)))
 
 static void icvFindStereoCorrespondenceByBirchfieldDP( uchar* src1, uchar* src2,
                                                 uchar* disparities,
@@ -87,7 +87,7 @@ static void icvFindStereoCorrespondenceByBirchfieldDP( uchar* src1, uchar* src2,
                                                 float  _param3, float _param4,
                                                 float  _param5 )
 {
-    int     x, y, i, j, temp2, temp3;
+    int     x, y, i, j;
     int     d, s;
     int     dispH =  maxDisparity + 3;
     uchar  *dispdata;
diff --git a/modules/ml/src/nbayes.cpp b/modules/ml/src/nbayes.cpp
index b9a966cd1..5ad1b134d 100644
--- a/modules/ml/src/nbayes.cpp
+++ b/modules/ml/src/nbayes.cpp
@@ -210,6 +210,8 @@ bool CvNormalBayesClassifier::train( const CvMat* _train_data, const CvMat* _res
                 prod_data[c2] += train_vec[c2]*val1;
         }
     }
+    cvReleaseMat( &responses );
+    responses = 0;
 
     /* calculate avg, covariance matrix, c */
     for( cls = 0; cls < nclasses; cls++ )
diff --git a/modules/nonfree/src/surf.ocl.cpp b/modules/nonfree/src/surf.ocl.cpp
index 698168f0b..42576fa25 100644
--- a/modules/nonfree/src/surf.ocl.cpp
+++ b/modules/nonfree/src/surf.ocl.cpp
@@ -81,11 +81,6 @@ namespace cv
     }
 }
 
-static inline size_t divUp(size_t total, size_t grain)
-{
-    return (total + grain - 1) / grain;
-}
-
 static inline int calcSize(int octave, int layer)
 {
     /* Wavelet size at first layer of first octave. */
diff --git a/modules/ocl/CMakeLists.txt b/modules/ocl/CMakeLists.txt
index 05b28b83f..69d9df52d 100644
--- a/modules/ocl/CMakeLists.txt
+++ b/modules/ocl/CMakeLists.txt
@@ -3,5 +3,5 @@ if(NOT HAVE_OPENCL)
 endif()
 
 set(the_description "OpenCL-accelerated Computer Vision")
-ocv_define_module(ocl opencv_core opencv_imgproc opencv_features2d opencv_objdetect opencv_video opencv_calib3d)
+ocv_define_module(ocl opencv_core opencv_imgproc opencv_features2d opencv_objdetect opencv_video opencv_calib3d opencv_ml)
 ocv_warnings_disable(CMAKE_CXX_FLAGS -Wshadow)
diff --git a/modules/ocl/doc/camera_calibration_and_3D_reconstruction.rst b/modules/ocl/doc/camera_calibration_and_3D_reconstruction.rst
new file mode 100644
index 000000000..96ed6bbad
--- /dev/null
+++ b/modules/ocl/doc/camera_calibration_and_3D_reconstruction.rst
@@ -0,0 +1,334 @@
+Camera Calibration and 3D Reconstruction
+========================================
+
+.. highlight:: cpp
+
+
+
+ocl::StereoBM_OCL
+---------------------
+.. ocv:class:: ocl::StereoBM_OCL
+
+Class computing stereo correspondence (disparity map) using the block matching algorithm. ::
+
+    class CV_EXPORTS StereoBM_OCL
+    {
+    public:
+        enum { BASIC_PRESET = 0, PREFILTER_XSOBEL = 1 };
+
+        enum { DEFAULT_NDISP = 64, DEFAULT_WINSZ = 19 };
+
+        //! the default constructor
+        StereoBM_OCL();
+        //! the full constructor taking the camera-specific preset, number of disparities and the SAD window size. ndisparities must be multiple of 8.
+        StereoBM_OCL(int preset, int ndisparities = DEFAULT_NDISP, int winSize = DEFAULT_WINSZ);
+
+        //! the stereo correspondence operator. Finds the disparity for the specified rectified stereo pair
+        //! Output disparity has CV_8U type.
+        void operator() ( const oclMat &left, const oclMat &right, oclMat &disparity);
+
+        //! A heuristic that tries to estimate
+        // whether the current GPU will be faster than the CPU in this algorithm.
+        // It queries current active device.
+        static bool checkIfGpuCallReasonable();
+
+        int preset;
+        int ndisp;
+        int winSize;
+
+        // If avergeTexThreshold == 0 => post-processing is disabled
+        // If avergeTexThreshold != 0 then disparity is set to 0 in each point (x,y) where for left image
+        // SumOfHorizontalGradientsInWindow(x, y, winSize) < (winSize * winSize) * avergeTexThreshold
+        // i.e. input left image is low textured.
+        float avergeTexThreshold;
+    private:
+        /* hidden */
+    };
+
+
+The class also performs pre- and post-filtering steps: Sobel pre-filtering (if the ``PREFILTER_XSOBEL`` flag is set) and low textureness filtering (if ``avergeTexThreshold > 0`` ). If ``avergeTexThreshold = 0`` , low textureness filtering is disabled. Otherwise, the disparity is set to 0 in each point ``(x, y)`` , where for the left image
+
+.. math::
+    \sum HorizontalGradientsInWindow(x, y, winSize) < (winSize \cdot winSize) \cdot avergeTexThreshold
+
+This means that the input left image is low textured.
+
+
+ocl::StereoBM_OCL::StereoBM_OCL
+-----------------------------------
+Enables :ocv:class:`ocl::StereoBM_OCL` constructors.
+
+.. ocv:function:: ocl::StereoBM_OCL::StereoBM_OCL()
+
+.. ocv:function:: ocl::StereoBM_OCL::StereoBM_OCL(int preset, int ndisparities = DEFAULT_NDISP, int winSize = DEFAULT_WINSZ)
+
+    :param preset: Parameter presetting:
+
+        * **BASIC_PRESET** Basic mode without pre-processing.
+
+        * **PREFILTER_XSOBEL** Sobel pre-filtering mode.
+
+    :param ndisparities: Number of disparities. It must be a multiple of 8 and less than or equal to 256.
+
+    :param winSize: Block size.
+
+
+
+ocl::StereoBM_OCL::operator ()
+----------------------------------
+Enables the stereo correspondence operator that finds the disparity for the specified rectified stereo pair.
+
+.. ocv:function:: void ocl::StereoBM_OCL::operator ()(const oclMat& left, const oclMat& right, oclMat& disparity)
+
+    :param left: Left image. Only  ``CV_8UC1``  type is supported.
+
+    :param right: Right image with the same size and the same type as the left one.
+
+    :param disparity: Output disparity map. It is a  ``CV_8UC1``  image with the same size as the input images.
+
+    :param stream: Stream for the asynchronous version.
+
+
+ocl::StereoBM_OCL::checkIfGpuCallReasonable
+-----------------------------------------------
+Uses a heuristic method to estimate whether the current GPU is faster than the CPU in this algorithm. It queries the currently active device.
+
+.. ocv:function:: bool ocl::StereoBM_OCL::checkIfGpuCallReasonable()
+
+ocl::StereoBeliefPropagation
+--------------------------------
+.. ocv:class:: ocl::StereoBeliefPropagation
+
+Class computing stereo correspondence using the belief propagation algorithm. ::
+
+    class CV_EXPORTS StereoBeliefPropagation
+    {
+    public:
+        enum { DEFAULT_NDISP  = 64 };
+        enum { DEFAULT_ITERS  = 5  };
+        enum { DEFAULT_LEVELS = 5  };
+        static void estimateRecommendedParams(int width, int height, int &ndisp, int &iters, int &levels);
+        explicit StereoBeliefPropagation(int ndisp  = DEFAULT_NDISP,
+                                         int iters  = DEFAULT_ITERS,
+                                         int levels = DEFAULT_LEVELS,
+                                         int msg_type = CV_16S);
+        StereoBeliefPropagation(int ndisp, int iters, int levels,
+                                float max_data_term, float data_weight,
+                                float max_disc_term, float disc_single_jump,
+                                int msg_type = CV_32F);
+        void operator()(const oclMat &left, const oclMat &right, oclMat &disparity);
+        void operator()(const oclMat &data, oclMat &disparity);
+        int ndisp;
+        int iters;
+        int levels;
+        float max_data_term;
+        float data_weight;
+        float max_disc_term;
+        float disc_single_jump;
+        int msg_type;
+    private:
+        /* hidden */
+    };
+
+The class implements the algorithm described in [Felzenszwalb2006]_ . It can compute its own data cost (using a truncated linear model) or use a user-provided data cost.
+
+.. note::
+
+    ``StereoBeliefPropagation`` requires a lot of memory for message storage:
+
+    .. math::
+
+        width \_ step  \cdot height  \cdot ndisp  \cdot 4  \cdot (1 + 0.25)
+
+    and for data cost storage:
+
+    .. math::
+
+        width\_step \cdot height \cdot ndisp \cdot (1 + 0.25 + 0.0625 +  \dotsm + \frac{1}{4^{levels}})
+
+    ``width_step`` is the number of bytes in a line including padding.
+
+
+
+ocl::StereoBeliefPropagation::StereoBeliefPropagation
+---------------------------------------------------------
+Enables the :ocv:class:`ocl::StereoBeliefPropagation` constructors.
+
+.. ocv:function:: ocl::StereoBeliefPropagation::StereoBeliefPropagation(int ndisp = DEFAULT_NDISP, int iters = DEFAULT_ITERS, int levels = DEFAULT_LEVELS, int msg_type = CV_16S)
+
+.. ocv:function:: ocl::StereoBeliefPropagation::StereoBeliefPropagation(int ndisp, int iters, int levels, float max_data_term, float data_weight, float max_disc_term, float disc_single_jump, int msg_type = CV_32F)
+
+    :param ndisp: Number of disparities.
+
+    :param iters: Number of BP iterations on each level.
+
+    :param levels: Number of levels.
+
+    :param max_data_term: Threshold for data cost truncation.
+
+    :param data_weight: Data weight.
+
+    :param max_disc_term: Threshold for discontinuity truncation.
+
+    :param disc_single_jump: Discontinuity single jump.
+
+    :param msg_type: Type for messages.  ``CV_16SC1``  and  ``CV_32FC1`` types are supported.
+
+``StereoBeliefPropagation`` uses a truncated linear model for the data cost and discontinuity terms:
+
+.. math::
+
+    DataCost = data \_ weight  \cdot \min ( \lvert Img_{Left}(x,y)-Img_{Right}(x-d,y)  \rvert , max \_ data \_ term)
+
+.. math::
+
+    DiscTerm =  \min (disc \_ single \_ jump  \cdot \lvert f_1-f_2  \rvert , max \_ disc \_ term)
+
+For more details, see [Felzenszwalb2006]_.
+
+By default, :ocv:class:`ocl::StereoBeliefPropagation` uses floating-point arithmetics and the ``CV_32FC1`` type for messages. But it can also use fixed-point arithmetics and the ``CV_16SC1`` message type for better performance. To avoid an overflow in this case, the parameters must satisfy the following requirement:
+
+.. math::
+
+    10  \cdot 2^{levels-1}  \cdot max \_ data \_ term < SHRT \_ MAX
+
+
+
+ocl::StereoBeliefPropagation::estimateRecommendedParams
+-----------------------------------------------------------
+Uses a heuristic method to compute the recommended parameters ( ``ndisp``, ``iters`` and ``levels`` ) for the specified image size ( ``width`` and ``height`` ).
+
+.. ocv:function:: void ocl::StereoBeliefPropagation::estimateRecommendedParams(int width, int height, int& ndisp, int& iters, int& levels)
+
+
+
+ocl::StereoBeliefPropagation::operator ()
+---------------------------------------------
+Enables the stereo correspondence operator that finds the disparity for the specified rectified stereo pair or data cost.
+
+.. ocv:function:: void ocl::StereoBeliefPropagation::operator ()(const oclMat& left, const oclMat& right, oclMat& disparity)
+
+.. ocv:function:: void ocl::StereoBeliefPropagation::operator ()(const oclMat& data, oclMat& disparity)
+
+    :param left: Left image. ``CV_8UC1`` , ``CV_8UC3``  and  ``CV_8UC4``  types are supported.
+
+    :param right: Right image with the same size and the same type as the left one.
+
+    :param data: User-specified data cost, a matrix of ``msg_type`` type and ``Size(<image columns>*ndisp, <image rows>)`` size.
+
+    :param disparity: Output disparity map. If  ``disparity``  is empty, the output type is  ``CV_16SC1`` . Otherwise, the type is retained.
+
+    :param stream: Stream for the asynchronous version.
+
+ocl::StereoConstantSpaceBP
+------------------------------
+.. ocv:class:: ocl::StereoConstantSpaceBP
+
+Class computing stereo correspondence using the constant space belief propagation algorithm. ::
+
+    class CV_EXPORTS StereoConstantSpaceBP
+    {
+    public:
+        enum { DEFAULT_NDISP    = 128 };
+        enum { DEFAULT_ITERS    = 8   };
+        enum { DEFAULT_LEVELS   = 4   };
+        enum { DEFAULT_NR_PLANE = 4   };
+        static void estimateRecommendedParams(int width, int height, int &ndisp, int &iters, int &levels, int &nr_plane);
+        explicit StereoConstantSpaceBP(
+            int ndisp    = DEFAULT_NDISP,
+            int iters    = DEFAULT_ITERS,
+            int levels   = DEFAULT_LEVELS,
+            int nr_plane = DEFAULT_NR_PLANE,
+            int msg_type = CV_32F);
+        StereoConstantSpaceBP(int ndisp, int iters, int levels, int nr_plane,
+            float max_data_term, float data_weight, float max_disc_term, float disc_single_jump,
+            int min_disp_th = 0,
+            int msg_type = CV_32F);
+        void operator()(const oclMat &left, const oclMat &right, oclMat &disparity);
+        int ndisp;
+        int iters;
+        int levels;
+        int nr_plane;
+        float max_data_term;
+        float data_weight;
+        float max_disc_term;
+        float disc_single_jump;
+        int min_disp_th;
+        int msg_type;
+        bool use_local_init_data_cost;
+    private:
+        /* hidden */
+    };
+
+The class implements the algorithm described in [Yang2010]_. ``StereoConstantSpaceBP`` supports both local minimum and global minimum data cost initialization algorithms. For more details, see the paper mentioned above. By default, a local algorithm is used. To enable a global algorithm, set ``use_local_init_data_cost`` to ``false`` .
+
+
+ocl::StereoConstantSpaceBP::StereoConstantSpaceBP
+-----------------------------------------------------
+Enables the :ocv:class:`ocl::StereoConstantSpaceBP` constructors.
+
+.. ocv:function:: ocl::StereoConstantSpaceBP::StereoConstantSpaceBP(int ndisp = DEFAULT_NDISP, int iters = DEFAULT_ITERS, int levels = DEFAULT_LEVELS, int nr_plane = DEFAULT_NR_PLANE, int msg_type = CV_32F)
+
+.. ocv:function:: ocl::StereoConstantSpaceBP::StereoConstantSpaceBP(int ndisp, int iters, int levels, int nr_plane, float max_data_term, float data_weight, float max_disc_term, float disc_single_jump, int min_disp_th = 0, int msg_type = CV_32F)
+
+    :param ndisp: Number of disparities.
+
+    :param iters: Number of BP iterations on each level.
+
+    :param levels: Number of levels.
+
+    :param nr_plane: Number of disparity levels on the first level.
+
+    :param max_data_term: Truncation of data cost.
+
+    :param data_weight: Data weight.
+
+    :param max_disc_term: Truncation of discontinuity.
+
+    :param disc_single_jump: Discontinuity single jump.
+
+    :param min_disp_th: Minimal disparity threshold.
+
+    :param msg_type: Type for messages.  ``CV_16SC1``  and  ``CV_32FC1`` types are supported.
+
+``StereoConstantSpaceBP`` uses a truncated linear model for the data cost and discontinuity terms:
+
+.. math::
+
+    DataCost = data \_ weight  \cdot \min ( \lvert I_2-I_1  \rvert , max \_ data \_ term)
+
+.. math::
+
+    DiscTerm =  \min (disc \_ single \_ jump  \cdot \lvert f_1-f_2  \rvert , max \_ disc \_ term)
+
+For more details, see [Yang2010]_.
+
+By default, ``StereoConstantSpaceBP`` uses floating-point arithmetics and the ``CV_32FC1`` type for messages. But it can also use fixed-point arithmetics and the ``CV_16SC1`` message type for better performance. To avoid an overflow in this case, the parameters must satisfy the following requirement:
+
+.. math::
+
+    10  \cdot 2^{levels-1}  \cdot max \_ data \_ term < SHRT \_ MAX
+
+
+
+ocl::StereoConstantSpaceBP::estimateRecommendedParams
+---------------------------------------------------------
+Uses a heuristic method to compute the recommended parameters ( ``ndisp``, ``iters``, ``levels`` and ``nr_plane`` ) for the specified image size ( ``width`` and ``height`` ).
+
+.. ocv:function:: void ocl::StereoConstantSpaceBP::estimateRecommendedParams(int width, int height, int& ndisp, int& iters, int& levels, int& nr_plane)
+
+
+
+ocl::StereoConstantSpaceBP::operator ()
+-------------------------------------------
+Enables the stereo correspondence operator that finds the disparity for the specified rectified stereo pair.
+
+.. ocv:function:: void ocl::StereoConstantSpaceBP::operator ()(const oclMat& left, const oclMat& right, oclMat& disparity)
+
+    :param left: Left image. ``CV_8UC1`` , ``CV_8UC3``  and  ``CV_8UC4``  types are supported.
+
+    :param right: Right image with the same size and the same type as the left one.
+
+    :param disparity: Output disparity map. If  ``disparity``  is empty, the output type is  ``CV_16SC1`` . Otherwise, the output type is  ``disparity.type()`` .
+
+    :param stream: Stream for the asynchronous version.
\ No newline at end of file
diff --git a/modules/ocl/doc/feature_detection_and_description.rst b/modules/ocl/doc/feature_detection_and_description.rst
index 11fb27242..d052ef8cc 100644
--- a/modules/ocl/doc/feature_detection_and_description.rst
+++ b/modules/ocl/doc/feature_detection_and_description.rst
@@ -37,7 +37,7 @@ Finds edges in an image using the [Canny86]_ algorithm.
 
 
 ocl::BruteForceMatcher_OCL_base
--------------------------------
+-----------------------------------
 .. ocv:class:: ocl::BruteForceMatcher_OCL_base
 
 Brute-force descriptor matcher. For each descriptor in the first set, this matcher finds the closest descriptor in the second set by trying each one. This descriptor matcher supports masking permissible matches between descriptor sets. ::
@@ -153,7 +153,7 @@ The class ``BruteForceMatcher_OCL_base`` has an interface similar to the class :
 
 
 ocl::BruteForceMatcher_OCL_base::match
---------------------------------------
+------------------------------------------
 Finds the best match for each descriptor from a query set with train descriptors.
 
 .. ocv:function:: void ocl::BruteForceMatcher_OCL_base::match(const oclMat& query, const oclMat& train, std::vector<DMatch>& matches, const oclMat& mask = oclMat())
@@ -169,14 +169,14 @@ Finds the best match for each descriptor from a query set with train descriptors
 
 
 ocl::BruteForceMatcher_OCL_base::makeGpuCollection
---------------------------------------------------
+------------------------------------------------------
 Performs a GPU collection of train descriptors and masks in a suitable format for the :ocv:func:`ocl::BruteForceMatcher_OCL_base::matchCollection` function.
 
 .. ocv:function:: void ocl::BruteForceMatcher_OCL_base::makeGpuCollection(oclMat& trainCollection, oclMat& maskCollection, const vector<oclMat>& masks = std::vector<oclMat>())
 
 
 ocl::BruteForceMatcher_OCL_base::matchDownload
-----------------------------------------------
+--------------------------------------------------
 Downloads matrices obtained via :ocv:func:`ocl::BruteForceMatcher_OCL_base::matchSingle` or :ocv:func:`ocl::BruteForceMatcher_OCL_base::matchCollection` to vector with :ocv:class:`DMatch`.
 
 .. ocv:function:: static void ocl::BruteForceMatcher_OCL_base::matchDownload( const oclMat& trainIdx, const oclMat& distance, std::vector<DMatch>& matches )
@@ -185,7 +185,7 @@ Downloads matrices obtained via :ocv:func:`ocl::BruteForceMatcher_OCL_base::matc
 
 
 ocl::BruteForceMatcher_OCL_base::matchConvert
----------------------------------------------
+-------------------------------------------------
 Converts matrices obtained via :ocv:func:`ocl::BruteForceMatcher_OCL_base::matchSingle` or :ocv:func:`ocl::BruteForceMatcher_OCL_base::matchCollection` to vector with :ocv:class:`DMatch`.
 
 .. ocv:function:: void ocl::BruteForceMatcher_OCL_base::matchConvert(const Mat& trainIdx, const Mat& distance, std::vector<DMatch>&matches)
@@ -195,7 +195,7 @@ Converts matrices obtained via :ocv:func:`ocl::BruteForceMatcher_OCL_base::match
 
 
 ocl::BruteForceMatcher_OCL_base::knnMatch
------------------------------------------
+---------------------------------------------
 Finds the ``k`` best matches for each descriptor from a query set with train descriptors.
 
 .. ocv:function:: void ocl::BruteForceMatcher_OCL_base::knnMatch(const oclMat& query, const oclMat& train, std::vector< std::vector<DMatch> >&matches, int k, const oclMat& mask = oclMat(), bool compactResult = false)
@@ -226,7 +226,7 @@ The third variant of the method stores the results in GPU memory.
 
 
 ocl::BruteForceMatcher_OCL_base::knnMatchDownload
--------------------------------------------------
+-----------------------------------------------------
 Downloads matrices obtained via :ocv:func:`ocl::BruteForceMatcher_OCL_base::knnMatchSingle` or :ocv:func:`ocl::BruteForceMatcher_OCL_base::knnMatch2Collection` to vector with :ocv:class:`DMatch`.
 
 .. ocv:function:: void ocl::BruteForceMatcher_OCL_base::knnMatchDownload(const oclMat& trainIdx, const oclMat& distance, std::vector< std::vector<DMatch> >&matches, bool compactResult = false)
@@ -238,7 +238,7 @@ If ``compactResult`` is ``true`` , the ``matches`` vector does not contain match
 
 
 ocl::BruteForceMatcher_OCL_base::knnMatchConvert
-------------------------------------------------
+----------------------------------------------------
 Converts matrices obtained via :ocv:func:`ocl::BruteForceMatcher_OCL_base::knnMatchSingle` or :ocv:func:`ocl::BruteForceMatcher_OCL_base::knnMatch2Collection` to CPU vector with :ocv:class:`DMatch`.
 
 .. ocv:function:: void ocl::BruteForceMatcher_OCL_base::knnMatchConvert(const Mat& trainIdx, const Mat& distance, std::vector< std::vector<DMatch> >&matches, bool compactResult = false)
@@ -250,7 +250,7 @@ If ``compactResult`` is ``true`` , the ``matches`` vector does not contain match
 
 
 ocl::BruteForceMatcher_OCL_base::radiusMatch
---------------------------------------------
+------------------------------------------------
 For each query descriptor, finds the best matches with a distance less than a given threshold.
 
 .. ocv:function:: void ocl::BruteForceMatcher_OCL_base::radiusMatch(const oclMat& query, const oclMat& train, std::vector< std::vector<DMatch> >&matches, float maxDistance, const oclMat& mask = oclMat(), bool compactResult = false)
@@ -283,7 +283,7 @@ The third variant of the method stores the results in GPU memory and does not st
 
 
 ocl::BruteForceMatcher_OCL_base::radiusMatchDownload
-----------------------------------------------------
+--------------------------------------------------------
 Downloads matrices obtained via :ocv:func:`ocl::BruteForceMatcher_OCL_base::radiusMatchSingle` or :ocv:func:`ocl::BruteForceMatcher_OCL_base::radiusMatchCollection` to vector with :ocv:class:`DMatch`.
 
 .. ocv:function:: void ocl::BruteForceMatcher_OCL_base::radiusMatchDownload(const oclMat& trainIdx, const oclMat& distance, const oclMat& nMatches, std::vector< std::vector<DMatch> >&matches, bool compactResult = false)
@@ -296,7 +296,7 @@ If ``compactResult`` is ``true`` , the ``matches`` vector does not contain match
 
 
 ocl::BruteForceMatcher_OCL_base::radiusMatchConvert
----------------------------------------------------
+-------------------------------------------------------
 Converts matrices obtained via :ocv:func:`ocl::BruteForceMatcher_OCL_base::radiusMatchSingle` or :ocv:func:`ocl::BruteForceMatcher_OCL_base::radiusMatchCollection` to vector with :ocv:class:`DMatch`.
 
 .. ocv:function:: void ocl::BruteForceMatcher_OCL_base::radiusMatchConvert(const Mat& trainIdx, const Mat& distance, const Mat& nMatches, std::vector< std::vector<DMatch> >&matches, bool compactResult = false)
@@ -306,7 +306,7 @@ Converts matrices obtained via :ocv:func:`ocl::BruteForceMatcher_OCL_base::radiu
 If ``compactResult`` is ``true`` , the ``matches`` vector does not contain matches for fully masked-out query descriptors.
 
 ocl::HOGDescriptor
-------------------
+----------------------
 
 .. ocv:struct:: ocl::HOGDescriptor
 
diff --git a/modules/ocl/doc/image_filtering.rst b/modules/ocl/doc/image_filtering.rst
index 4798bddaa..5547eb29f 100644
--- a/modules/ocl/doc/image_filtering.rst
+++ b/modules/ocl/doc/image_filtering.rst
@@ -3,6 +3,360 @@ Image Filtering
 
 .. highlight:: cpp
 
+ocl::BaseRowFilter_GPU
+--------------------------
+.. ocv:class:: ocl::BaseRowFilter_GPU
+
+Base class for linear or non-linear filters that process rows of 2D arrays. Such filters are used for the "horizontal" filtering passes in separable filters. ::
+
+    class CV_EXPORTS BaseRowFilter_GPU
+    {
+    public:
+        BaseRowFilter_GPU(int ksize_, int anchor_, int bordertype_) : ksize(ksize_), anchor(anchor_), bordertype(bordertype_) {}
+        virtual ~BaseRowFilter_GPU() {}
+        virtual void operator()(const oclMat &src, oclMat &dst) = 0;
+        int ksize, anchor, bordertype;
+    };
+
+.. note:: This class does not allocate memory for a destination image. Usually this class is used inside :ocv:class:`ocl::FilterEngine_GPU`.
+
+ocl::BaseColumnFilter_GPU
+-----------------------------
+.. ocv:class:: ocl::BaseColumnFilter_GPU
+
+Base class for linear or non-linear filters that process columns of 2D arrays. Such filters are used for the "vertical" filtering passes in separable filters. ::
+
+    class CV_EXPORTS BaseColumnFilter_GPU
+    {
+    public:
+        BaseColumnFilter_GPU(int ksize_, int anchor_, int bordertype_) : ksize(ksize_), anchor(anchor_), bordertype(bordertype_) {}
+        virtual ~BaseColumnFilter_GPU() {}
+        virtual void operator()(const oclMat &src, oclMat &dst) = 0;
+        int ksize, anchor, bordertype;
+    };
+
+.. note:: This class does not allocate memory for a destination image. Usually this class is used inside :ocv:class:`ocl::FilterEngine_GPU`.
+
+ocl::BaseFilter_GPU
+-----------------------
+.. ocv:class:: ocl::BaseFilter_GPU
+
+Base class for non-separable 2D filters. ::
+
+    class CV_EXPORTS BaseFilter_GPU
+    {
+    public:
+        BaseFilter_GPU(const Size &ksize_, const Point &anchor_, const int &borderType_)
+            : ksize(ksize_), anchor(anchor_), borderType(borderType_) {}
+        virtual ~BaseFilter_GPU() {}
+        virtual void operator()(const oclMat &src, oclMat &dst) = 0;
+        Size ksize;
+        Point anchor;
+        int borderType;
+    };
+
+.. note:: This class does not allocate memory for a destination image. Usually this class is used inside :ocv:class:`ocl::FilterEngine_GPU`.
+
+ocl::FilterEngine_GPU
+------------------------
+.. ocv:class:: ocl::FilterEngine_GPU
+
+Base class for the Filter Engine. ::
+
+    class CV_EXPORTS FilterEngine_GPU
+    {
+    public:
+        virtual ~FilterEngine_GPU() {}
+
+        virtual void apply(const oclMat &src, oclMat &dst, Rect roi = Rect(0, 0, -1, -1)) = 0;
+    };
+
+The class can be used to apply an arbitrary filtering operation to an image. It contains all the necessary intermediate buffers. Pointers to the initialized ``FilterEngine_GPU`` instances are returned by various ``create*Filter_GPU`` functions (see below), and they are used inside high-level functions such as :ocv:func:`ocl::filter2D`, :ocv:func:`ocl::erode`, :ocv:func:`ocl::Sobel` , and others.
+
+By using ``FilterEngine_GPU`` instead of functions you can avoid unnecessary memory allocation for intermediate buffers and get better performance: ::
+
+    while (...)
+    {
+        ocl::oclMat src = getImg();
+        ocl::oclMat dst;
+        // Allocate and release buffers at each iteration
+        ocl::GaussianBlur(src, dst, ksize, sigma1);
+    }
+
+    // Allocate buffers only once
+    cv::Ptr<ocl::FilterEngine_GPU> filter =
+        ocl::createGaussianFilter_GPU(CV_8UC4, ksize, sigma1);
+    while (...)
+    {
+        ocl::oclMat src = getImg();
+        ocl::oclMat dst;
+        filter->apply(src, dst, cv::Rect(0, 0, src.cols, src.rows));
+    }
+    // Release buffers only once
+    filter.release();
+
+
+``FilterEngine_GPU`` can process a rectangular sub-region of an image. By default, if ``roi == Rect(0,0,-1,-1)`` , ``FilterEngine_GPU`` processes the inner region of an image ( ``Rect(anchor.x, anchor.y, src_size.width - ksize.width, src_size.height - ksize.height)`` ) because some filters do not check whether indices are outside the image for better performance. See below to understand which filters support processing the whole image and which do not and identify image type limitations.
+
+.. note:: The GPU filters do not support the in-place mode.
+
+.. seealso:: :ocv:class:`ocl::BaseRowFilter_GPU`, :ocv:class:`ocl::BaseColumnFilter_GPU`, :ocv:class:`ocl::BaseFilter_GPU`, :ocv:func:`ocl::createFilter2D_GPU`, :ocv:func:`ocl::createSeparableFilter_GPU`, :ocv:func:`ocl::createBoxFilter_GPU`, :ocv:func:`ocl::createMorphologyFilter_GPU`, :ocv:func:`ocl::createLinearFilter_GPU`, :ocv:func:`ocl::createSeparableLinearFilter_GPU`, :ocv:func:`ocl::createDerivFilter_GPU`, :ocv:func:`ocl::createGaussianFilter_GPU`
+
+ocl::createFilter2D_GPU
+---------------------------
+Creates a non-separable filter engine with the specified filter.
+
+.. ocv:function:: Ptr<FilterEngine_GPU> ocl::createFilter2D_GPU( const Ptr<BaseFilter_GPU> filter2D)
+
+    :param filter2D: Non-separable 2D filter.
+
+Usually this function is used inside such high-level functions as :ocv:func:`ocl::createLinearFilter_GPU`, :ocv:func:`ocl::createBoxFilter_GPU`.
+
+
+ocl::createSeparableFilter_GPU
+----------------------------------
+Creates a separable filter engine with the specified filters.
+
+.. ocv:function:: Ptr<FilterEngine_GPU> ocl::createSeparableFilter_GPU(const Ptr<BaseRowFilter_GPU> &rowFilter, const Ptr<BaseColumnFilter_GPU> &columnFilter)
+
+    :param rowFilter: "Horizontal" 1D filter.
+
+    :param columnFilter: "Vertical" 1D filter.
+
+Usually this function is used inside such high-level functions as :ocv:func:`ocl::createSeparableLinearFilter_GPU`.
+
+ocl::createBoxFilter_GPU
+----------------------------
+Creates a normalized 2D box filter.
+
+.. ocv:function:: Ptr<FilterEngine_GPU> ocl::createBoxFilter_GPU(int srcType, int dstType, const Size &ksize, const Point &anchor = Point(-1, -1), int borderType = BORDER_DEFAULT)
+
+.. ocv:function:: Ptr<BaseFilter_GPU> ocl::getBoxFilter_GPU(int srcType, int dstType, const Size &ksize, Point anchor = Point(-1, -1), int borderType = BORDER_DEFAULT)
+
+    :param srcType: Input image type supporting ``CV_8UC1`` and ``CV_8UC4`` .
+
+    :param dstType: Output image type.  It supports only the same values as the source type.
+
+    :param ksize: Kernel size.
+
+    :param anchor: Anchor point. The default value ``Point(-1, -1)`` means that the anchor is at the kernel center.
+
+    :param borderType: Supported border types: BORDER_CONSTANT, BORDER_REPLICATE, BORDER_REFLECT, BORDER_REFLECT_101, BORDER_WRAP.
+
+.. note:: This filter does not check out-of-border accesses, so only a proper sub-matrix of a bigger matrix has to be passed to it.
+
+.. seealso:: :ocv:func:`boxFilter`
+
+ocl::boxFilter
+------------------
+Smooths the image using the normalized box filter.
+
+.. ocv:function:: void ocl::boxFilter(const oclMat &src, oclMat &dst, int ddepth, Size ksize, Point anchor = Point(-1, -1), int borderType = BORDER_DEFAULT)
+
+    :param src: Input image. ``CV_8UC1`` and ``CV_8UC4`` source types are supported.
+
+    :param dst: Output image type. The size and type is the same as ``src`` .
+
+    :param ddepth: Output image depth. If -1, the output image has the same depth as the input one. The only values allowed here are ``CV_8U`` and -1.
+
+    :param ksize: Kernel size.
+
+    :param anchor: Anchor point. The default value ``Point(-1, -1)`` means that the anchor is at the kernel center.
+
+    :param borderType: Supported border types: BORDER_CONSTANT, BORDER_REPLICATE, BORDER_REFLECT, BORDER_REFLECT_101, BORDER_WRAP.
+
+Smooths the image using a box filter. Supported data types: CV_8UC1, CV_8UC4, CV_32FC1 and CV_32FC4.
+
+.. note::    This filter does not check out-of-border accesses, so only a proper sub-matrix of a bigger matrix has to be passed to it.
+
+ocl::blur
+-------------
+Acts as a synonym for the normalized box filter.
+
+.. ocv:function:: void ocl::blur(const oclMat &src, oclMat &dst, Size ksize, Point anchor = Point(-1, -1), int borderType = BORDER_CONSTANT)
+
+    :param src: Input image.  ``CV_8UC1``  and  ``CV_8UC4``  source types are supported.
+
+    :param dst: Output image type with the same size and type as  ``src`` .
+
+    :param ksize: Kernel size.
+
+    :param anchor: Anchor point. The default value Point(-1, -1) means that the anchor is at the kernel center.
+
+    :param borderType: Supported border types: BORDER_CONSTANT, BORDER_REPLICATE, BORDER_REFLECT, BORDER_REFLECT_101, BORDER_WRAP.
+
+.. note:: This filter does not check out-of-border accesses, so only a proper sub-matrix of a bigger matrix has to be passed to it.
+
+.. seealso:: :ocv:func:`blur`, :ocv:func:`ocl::boxFilter`
+
+ocl::createMorphologyFilter_GPU
+-----------------------------------
+Creates a 2D morphological filter.
+
+.. ocv:function:: Ptr<FilterEngine_GPU> ocl::createMorphologyFilter_GPU(int op, int type, const Mat &kernel, const Point &anchor = Point(-1, -1), int iterations = 1)
+
+.. ocv:function:: Ptr<BaseFilter_GPU> ocl::getMorphologyFilter_GPU(int op, int type, const Mat &kernel, const Size &ksize, Point anchor = Point(-1, -1))
+
+    :param op: Morphology operation id. Only ``MORPH_ERODE`` and ``MORPH_DILATE`` are supported.
+
+    :param type: Input/output image type. Only  ``CV_8UC1``  and  ``CV_8UC4``  are supported.
+
+    :param kernel: 2D 8-bit structuring element for the morphological operation.
+
+    :param ksize: Size of a horizontal or vertical structuring element used for separable morphological operations.
+
+    :param anchor: Anchor position within the structuring element. Negative values mean that the anchor is at the center.
+
+.. note:: This filter does not check out-of-border accesses, so only a proper sub-matrix of a bigger matrix has to be passed to it.
+
+.. seealso:: :ocv:func:`createMorphologyFilter`
+
+ocl::createLinearFilter_GPU
+-------------------------------
+Creates a non-separable linear filter.
+
+.. ocv:function:: Ptr<FilterEngine_GPU> ocl::createLinearFilter_GPU(int srcType, int dstType, const Mat &kernel, const Point &anchor = Point(-1, -1), int borderType = BORDER_DEFAULT)
+
+    :param srcType: Input image type. Supports  ``CV_8U``  ,  ``CV_16U``  and  ``CV_32F``  one and four channel image.
+
+    :param dstType: Output image type. The same type as ``src`` is supported.
+
+    :param kernel: 2D array of filter coefficients. Floating-point coefficients will be converted to fixed-point representation before the actual processing. Supports size up to 16. For larger kernels use :ocv:func:`ocl::convolve`.
+
+    :param anchor: Anchor point. The default value Point(-1, -1) means that the anchor is at the kernel center.
+
+    :param borderType: Pixel extrapolation method. For details, see :ocv:func:`borderInterpolate` .
+
+.. seealso:: :ocv:func:`createLinearFilter`
+
+
+ocl::filter2D
+-----------------
+Applies the non-separable 2D linear filter to an image.
+
+.. ocv:function:: void ocl::filter2D(const oclMat &src, oclMat &dst, int ddepth, const Mat &kernel, Point anchor = Point(-1, -1), int borderType = BORDER_DEFAULT)
+
+    :param src: Source image. Supports  ``CV_8U``  ,  ``CV_16U``  and  ``CV_32F``  one and four channel image.
+
+    :param dst: Destination image. The size and the number of channels is the same as  ``src`` .
+
+    :param ddepth: Desired depth of the destination image. If it is negative, it is the same as  ``src.depth()`` . It supports only the same depth as the source image depth.
+
+    :param kernel: 2D array of filter coefficients.
+
+    :param anchor: Anchor of the kernel that indicates the relative position of a filtered point within the kernel. The anchor resides within the kernel. The special default value (-1,-1) means that the anchor is at the kernel center.
+
+    :param borderType: Pixel extrapolation method. For details, see :ocv:func:`borderInterpolate` .
+
+    :param stream: Stream for the asynchronous version.
+
+ocl::getLinearRowFilter_GPU
+-------------------------------
+Creates a primitive row filter with the specified kernel.
+
+.. ocv:function:: Ptr<BaseRowFilter_GPU> ocl::getLinearRowFilter_GPU(int srcType, int bufType, const Mat &rowKernel, int anchor = -1, int bordertype = BORDER_DEFAULT)
+
+    :param srcType: Source array type. Only  ``CV_8UC1`` , ``CV_8UC4`` , ``CV_16SC1`` , ``CV_16SC2`` , ``CV_16SC3`` , ``CV_32SC1`` , ``CV_32FC1``  source types are supported.
+
+    :param bufType: Intermediate buffer type with as many channels as  ``srcType`` .
+
+    :param rowKernel: Filter coefficients. Support kernels with ``size <= 16`` .
+
+    :param anchor: Anchor position within the kernel. Negative values mean that the anchor is positioned at the aperture center.
+
+    :param bordertype: Pixel extrapolation method. For details, see :ocv:func:`borderInterpolate`.
+
+.. seealso:: :ocv:func:`createSeparableLinearFilter` .
+
+
+ocl::getLinearColumnFilter_GPU
+----------------------------------
+Creates a primitive column filter with the specified kernel.
+
+.. ocv:function:: Ptr<BaseColumnFilter_GPU> ocl::getLinearColumnFilter_GPU(int bufType, int dstType, const Mat &columnKernel, int anchor = -1, int bordertype = BORDER_DEFAULT, double delta = 0.0)
+
+    :param bufType: Intermediate buffer type with as many channels as  ``dstType`` .
+
+    :param dstType: Destination array type. ``CV_8UC1`` , ``CV_8UC4`` , ``CV_16SC1`` , ``CV_16SC2`` , ``CV_16SC3`` , ``CV_32SC1`` , ``CV_32FC1`` destination types are supported.
+
+    :param columnKernel: Filter coefficients. Support kernels with ``size <= 16`` .
+
+    :param anchor: Anchor position within the kernel. Negative values mean that the anchor is positioned at the aperture center.
+
+    :param bordertype: Pixel extrapolation method. For details, see  :ocv:func:`borderInterpolate` .
+
+    :param delta: default value is 0.0.
+
+.. seealso:: :ocv:func:`ocl::getLinearRowFilter_GPU`, :ocv:func:`createSeparableLinearFilter`
+
+ocl::createSeparableLinearFilter_GPU
+----------------------------------------
+Creates a separable linear filter engine.
+
+.. ocv:function:: Ptr<FilterEngine_GPU> ocl::createSeparableLinearFilter_GPU(int srcType, int dstType, const Mat &rowKernel, const Mat &columnKernel, const Point &anchor = Point(-1, -1), double delta = 0.0, int bordertype = BORDER_DEFAULT)
+
+    :param srcType: Source array type.  ``CV_8UC1`` , ``CV_8UC4`` , ``CV_16SC1`` , ``CV_16SC2`` , ``CV_16SC3`` , ``CV_32SC1`` , ``CV_32FC1``  source types are supported.
+
+    :param dstType: Destination array type.  ``CV_8UC1`` , ``CV_8UC4`` , ``CV_16SC1`` , ``CV_16SC2`` , ``CV_16SC3`` , ``CV_32SC1`` , ``CV_32FC1``  destination types are supported.
+
+    :param rowKernel: Horizontal filter coefficients. Support kernels with ``size <= 16`` .
+
+    :param columnKernel: Vertical filter coefficients. Support kernels with ``size <= 16`` .
+
+    :param anchor: Anchor position within the kernel. Negative values mean that anchor is positioned at the aperture center.
+
+    :param delta: default value is 0.0.
+
+    :param bordertype: Pixel extrapolation method.
+
+.. seealso:: :ocv:func:`ocl::getLinearRowFilter_GPU`, :ocv:func:`ocl::getLinearColumnFilter_GPU`, :ocv:func:`createSeparableLinearFilter`
+
+
+ocl::sepFilter2D
+--------------------
+Applies a separable 2D linear filter to an image.
+
+.. ocv:function:: void ocl::sepFilter2D(const oclMat &src, oclMat &dst, int ddepth, const Mat &kernelX, const Mat &kernelY, Point anchor = Point(-1, -1), double delta = 0.0, int bordertype = BORDER_DEFAULT)
+
+    :param src: Source image.  ``CV_8UC1`` , ``CV_8UC4`` , ``CV_16SC1`` , ``CV_16SC2`` , ``CV_32SC1`` , ``CV_32FC1``  source types are supported.
+
+    :param dst: Destination image with the same size and number of channels as  ``src`` .
+
+    :param ddepth: Destination image depth.  ``CV_8U`` , ``CV_16S`` , ``CV_32S`` , and  ``CV_32F`` are supported.
+
+    :param kernelX: Horizontal filter coefficients.
+
+    :param kernelY: Vertical filter coefficients.
+
+    :param anchor: Anchor position within the kernel. The default value ``(-1, -1)`` means that the anchor is at the kernel center.
+
+    :param delta: default value is 0.0.
+
+    :param bordertype: Pixel extrapolation method. For details, see  :ocv:func:`borderInterpolate`.
+
+.. seealso:: :ocv:func:`ocl::createSeparableLinearFilter_GPU`, :ocv:func:`sepFilter2D`
+
+ocl::createDerivFilter_GPU
+------------------------------
+Creates a filter engine for the generalized Sobel operator.
+
+.. ocv:function:: Ptr<FilterEngine_GPU> ocl::createDerivFilter_GPU( int srcType, int dstType, int dx, int dy, int ksize, int borderType = BORDER_DEFAULT )
+
+    :param srcType: Source image type.  ``CV_8UC1`` , ``CV_8UC4`` , ``CV_16SC1`` , ``CV_16SC2`` , ``CV_16SC3`` , ``CV_32SC1`` , ``CV_32FC1``  source types are supported.
+
+    :param dstType: Destination image type with as many channels as  ``srcType`` ,  ``CV_8U`` , ``CV_16S`` , ``CV_32S`` , and  ``CV_32F``  depths are supported.
+
+    :param dx: Derivative order with respect to x.
+
+    :param dy: Derivative order with respect to y.
+
+    :param ksize: Aperture size. See  :ocv:func:`getDerivKernels` for details.
+
+    :param borderType: Pixel extrapolation method. For details, see  :ocv:func:`borderInterpolate`.
+
+.. seealso:: :ocv:func:`ocl::createSeparableLinearFilter_GPU`, :ocv:func:`createDerivFilter`
+
+
 ocl::Sobel
 ------------------
 Returns void
@@ -53,8 +407,26 @@ Returns void
 
 The function computes the first x- or y- spatial image derivative using Scharr operator. Surpport 8UC1 8UC4 32SC1 32SC4 32FC1 32FC4 data type.
 
+ocl::createGaussianFilter_GPU
+---------------------------------
+Creates a Gaussian filter engine.
+
+.. ocv:function:: Ptr<FilterEngine_GPU> ocl::createGaussianFilter_GPU(int type, Size ksize, double sigma1, double sigma2 = 0, int bordertype = BORDER_DEFAULT)
+
+    :param type: Source and destination image type.  ``CV_8UC1`` , ``CV_8UC4`` , ``CV_16SC1`` , ``CV_16SC2`` , ``CV_16SC3`` , ``CV_32SC1`` , ``CV_32FC1`` are supported.
+
+    :param ksize: Aperture size. See  :ocv:func:`getGaussianKernel` for details.
+
+    :param sigma1: Gaussian sigma in the horizontal direction. See  :ocv:func:`getGaussianKernel` for details.
+
+    :param sigma2: Gaussian sigma in the vertical direction. If 0, then  :math:`\texttt{sigma2}\leftarrow\texttt{sigma1}` .
+
+    :param bordertype: Pixel extrapolation method. For details, see  :ocv:func:`borderInterpolate`.
+
+.. seealso:: :ocv:func:`ocl::createSeparableLinearFilter_GPU`, :ocv:func:`createGaussianFilter`
+
 ocl::GaussianBlur
-------------------
+---------------------
 Returns void
 
 .. ocv:function:: void ocl::GaussianBlur(const oclMat &src, oclMat &dst, Size ksize, double sigma1, double sigma2 = 0, int bordertype = BORDER_DEFAULT)
@@ -71,26 +443,6 @@ Returns void
 
 The function convolves the source image with the specified Gaussian kernel. In-place filtering is supported.  Surpport 8UC1 8UC4 32SC1 32SC4 32FC1 32FC4 data type.
 
-ocl::boxFilter
-------------------
-Returns void
-
-.. ocv:function:: void ocl::boxFilter(const oclMat &src, oclMat &dst, int ddepth, Size ksize, Point anchor = Point(-1, -1), int borderType = BORDER_DEFAULT)
-
-    :param src: The source image
-
-    :param dst: The destination image; It will have the same size and the same type as src
-
-    :param ddepth: The desired depth of the destination image
-
-    :param ksize: The smoothing kernel size. It must be positive and odd
-
-    :param anchor: The anchor point. The default value Point(-1,-1) means that the anchor is at the kernel center.
-
-    :param bordertype: Pixel extrapolation method.
-
-Smoothes image using box filter.Supports data type: CV_8UC1, CV_8UC4, CV_32FC1 and CV_32FC4.
-
 ocl::Laplacian
 ------------------
 Returns void
@@ -159,7 +511,7 @@ Returns void
 Convolves an image with the kernel. Supports only CV_32FC1 data types and do not support ROI.
 
 ocl::bilateralFilter
---------------------
+------------------------
 Returns void
 
 .. ocv:function:: void ocl::bilateralFilter(const oclMat &src, oclMat &dst, int d, double sigmaColor, double sigmaSpace, int borderType=BORDER_DEFAULT)
@@ -178,8 +530,42 @@ Returns void
 
 Applies bilateral filter to the image. Supports 8UC1 8UC4 data types.
 
+ocl::adaptiveBilateralFilter
+--------------------------------
+Returns void
+
+.. ocv:function:: void ocl::adaptiveBilateralFilter(const oclMat& src, oclMat& dst, Size ksize, double sigmaSpace, Point anchor = Point(-1, -1), int borderType=BORDER_DEFAULT)
+
+    :param src: The source image
+
+    :param dst: The destination image; will have the same size and the same type as src
+
+    :param ksize: The kernel size
+
+    :param sigmaSpace: Filter sigma in the coordinate space. Larger value of the parameter means that farther pixels will influence each other (as long as their colors are close enough; see sigmaColor). When d>0, it specifies the neighborhood size regardless of sigmaSpace; otherwise, d is proportional to sigmaSpace.
+
+    :param borderType: Pixel extrapolation method.
+
+A main part of our strategy will be to load each raw pixel once, and reuse it to calculate all pixels in the output (filtered) image that need this pixel value.
+
+.. math::
+
+    \emph{O}_i = \frac{1}{W_i}\sum\limits_{j\in{N(i)}}{\frac{1}{1+\frac{(V_i-V_j)^2}{\sigma_{N{'}(i)}^2}}*\frac{1}{1+\frac{d(i,j)^2}{\sum^2}}}V_j
+
+Local memory organization
+
+
+.. image:: images/adaptiveBilateralFilter.jpg
+                 :height: 250pt
+                 :width:  350pt
+                 :alt: Introduction Icon
+
+.. note:: We partition the image into non-overlapping blocks of size (Ux, Uy). Each such block will correspond to the pixel locations where we will calculate the filter result in one workgroup. Considering neighbourhoods of sizes (kx, ky), where kx = 2 dx + 1, and ky = 2 dy + 1 (in image ML, dx = dy = 1, and kx = ky = 3), it is clear that we need to load data of size Wx = Ux + 2 dx, Wy = Uy + 2 dy. Furthermore, if (Sx, Sy) is the top left pixel coordinate for a particular block, and (Sx + Ux - 1, Sy + Uy - 1) is the bottom right coordinate of the block, we need to load data starting at top left coordinate (PSx, PSy) = (Sx - dx, Sy - dy), and ending at bottom right coordinate (Sx + Ux - 1 + dx, Sy + Uy - 1 + dy). The workgroup layout is (Wx,1). However, to take advantage of the natural hardware properties (preferred wavefront sizes), we restrict Wx to be a multiple of that preferred wavefront size (for current AMD hardware this is typically 64). Each thread in the workgroup will load Wy elements (under the constraint that Wx*Wy*pixel width <= max local memory).
+
+Applies bilateral filter to the image. Supports 8UC1 8UC3 data types.
+
 ocl::copyMakeBorder
---------------------
+-----------------------
 Returns void
 
 .. ocv:function:: void ocl::copyMakeBorder(const oclMat &src, oclMat &dst, int top, int bottom, int left, int right, int boardtype, const Scalar &value = Scalar())
@@ -241,7 +627,7 @@ Returns void
 The function erodes the source image using the specified structuring element that determines the shape of a pixel neighborhood over which the minimum is taken. Supports 8UC1 8UC4 data types.
 
 ocl::morphologyEx
-------------------
+---------------------
 Returns void
 
 .. ocv:function:: void ocl::morphologyEx( const oclMat &src, oclMat &dst, int op, const Mat &kernel, Point anchor = Point(-1, -1), int iterations = 1, int borderType = BORDER_CONSTANT, const Scalar &borderValue = morphologyDefaultBorderValue())
@@ -277,7 +663,6 @@ Smoothes an image and downsamples it.
 .. seealso:: :ocv:func:`pyrDown`
 
 
-
 ocl::pyrUp
 -------------------
 Upsamples an image and then smoothes it.
@@ -302,7 +687,7 @@ Computes a vertical (column) sum.
 
 
 ocl::blendLinear
--------------------
+--------------------
 Performs linear blending of two images.
 
 .. ocv:function:: void ocl::blendLinear(const oclMat& img1, const oclMat& img2, const oclMat& weights1, const oclMat& weights2, oclMat& result)
diff --git a/modules/ocl/doc/image_processing.rst b/modules/ocl/doc/image_processing.rst
index 98f334762..c6c52786c 100644
--- a/modules/ocl/doc/image_processing.rst
+++ b/modules/ocl/doc/image_processing.rst
@@ -3,8 +3,82 @@ Image Processing
 
 .. highlight:: cpp
 
+ocl::meanShiftFiltering
+---------------------------
+Performs mean-shift filtering for each point of the source image.
+
+.. ocv:function:: void ocl::meanShiftFiltering(const oclMat &src, oclMat &dst, int sp, int sr, TermCriteria criteria = TermCriteria(TermCriteria::MAX_ITER + TermCriteria::EPS, 5, 1))
+
+    :param src: Source image. Only  ``CV_8UC4`` images are supported for now.
+
+    :param dst: Destination image containing the color of mapped points. It has the same size and type as  ``src`` .
+
+    :param sp: Spatial window radius.
+
+    :param sr: Color window radius.
+
+    :param criteria: Termination criteria. See :ocv:class:`TermCriteria`.
+
+It maps each point of the source image into another point. As a result, you have a new color and new position of each point.
+
+
+ocl::meanShiftProc
+----------------------
+Performs a mean-shift procedure and stores information about processed points (their colors and positions) in two images.
+
+.. ocv:function:: void ocl::meanShiftProc(const oclMat &src, oclMat &dstr, oclMat &dstsp, int sp, int sr, TermCriteria criteria = TermCriteria(TermCriteria::MAX_ITER + TermCriteria::EPS, 5, 1))
+
+    :param src: Source image. Only  ``CV_8UC4`` images are supported for now.
+
+    :param dstr: Destination image containing the color of mapped points. The size and type is the same as  ``src`` .
+
+    :param dstsp: Destination image containing the position of mapped points. The size is the same as  ``src`` size. The type is  ``CV_16SC2`` .
+
+    :param sp: Spatial window radius.
+
+    :param sr: Color window radius.
+
+    :param criteria: Termination criteria. See :ocv:class:`TermCriteria`.
+
+.. seealso:: :ocv:func:`ocl::meanShiftFiltering`
+
+
+ocl::meanShiftSegmentation
+------------------------------
+Performs a mean-shift segmentation of the source image and eliminates small segments.
+
+.. ocv:function:: void ocl::meanShiftSegmentation(const oclMat &src, Mat &dst, int sp, int sr, int minsize, TermCriteria criteria = TermCriteria(TermCriteria::MAX_ITER + TermCriteria::EPS, 5, 1))
+
+    :param src: Source image. Only  ``CV_8UC4`` images are supported for now.
+
+    :param dst: Segmented image with the same size and type as  ``src`` .
+
+    :param sp: Spatial window radius.
+
+    :param sr: Color window radius.
+
+    :param minsize: Minimum segment size. Smaller segments are merged.
+
+    :param criteria: Termination criteria. See :ocv:class:`TermCriteria`.
+
+ocl::integral
+-----------------
+Computes an integral image.
+
+.. ocv:function:: void ocl::integral(const oclMat &src, oclMat &sum, oclMat &sqsum)
+
+.. ocv:function:: void ocl::integral(const oclMat &src, oclMat &sum)
+
+    :param src: Source image. Only  ``CV_8UC1`` images are supported for now.
+
+    :param sum: Integral image containing 32-bit unsigned integer values packed into  ``CV_32SC1`` .
+
+    :param sqsum: Integral image for squared pixel values, of ``CV_32FC1`` type.
+
+.. seealso:: :ocv:func:`integral`
+
 ocl::cornerHarris
-------------------
+---------------------
 Returns void
 
 .. ocv:function:: void ocl::cornerHarris(const oclMat &src, oclMat &dst, int blockSize, int ksize, double k, int bordertype = cv::BORDER_DEFAULT)
@@ -24,7 +98,7 @@ Returns void
 Calculate Harris corner.
 
 ocl::cornerMinEigenVal
-------------------------
+--------------------------
 Returns void
 
 .. ocv:function:: void ocl::cornerMinEigenVal(const oclMat &src, oclMat &dst, int blockSize, int ksize, int bordertype = cv::BORDER_DEFAULT)
@@ -53,6 +127,19 @@ Returns void
 
 Calculates histogram of one or more arrays. Supports only 8UC1 data type.
 
+ocl::equalizeHist
+---------------------
+Equalizes the histogram of a grayscale image.
+
+.. ocv:function:: void ocl::equalizeHist(const oclMat &mat_src, oclMat &mat_dst)
+
+    :param mat_src: Source image.
+
+    :param mat_dst: Destination image.
+
+.. seealso:: :ocv:func:`equalizeHist`
+
+
 ocl::remap
 ------------------
 Returns void
@@ -96,7 +183,7 @@ Returns void
 Resizes an image. Supports CV_8UC1, CV_8UC3, CV_8UC4, CV_32FC1 , CV_32FC3 and CV_32FC4 data types.
 
 ocl::warpAffine
-------------------
+-------------------
 Returns void
 
 .. ocv:function:: void ocl::warpAffine(const oclMat &src, oclMat &dst, const Mat &M, Size dsize, int flags = INTER_LINEAR)
@@ -114,7 +201,7 @@ Returns void
 The function warpAffine transforms the source image using the specified matrix. Supports INTER_NEAREST, INTER_LINEAR, INTER_CUBIC types.
 
 ocl::warpPerspective
----------------------
+------------------------
 Returns void
 
 .. ocv:function:: void ocl::warpPerspective(const oclMat &src, oclMat &dst, const Mat &M, Size dsize, int flags = INTER_LINEAR)
@@ -209,7 +296,7 @@ Builds transformation maps for perspective transformation.
 
 
 ocl::buildWarpAffineMaps
-------------------------
+----------------------------
 Builds transformation maps for affine transformation.
 
 .. ocv:function:: void ocl::buildWarpAffineMaps(const Mat& M, bool inverse, Size dsize, oclMat& xmap, oclMat& ymap)
@@ -226,114 +313,6 @@ Builds transformation maps for affine transformation.
 
 .. seealso:: :ocv:func:`ocl::warpAffine` , :ocv:func:`ocl::remap`
 
-ocl::PyrLKOpticalFlow
----------------------
-.. ocv:class:: ocl::PyrLKOpticalFlow
-
-Class used for calculating an optical flow. ::
-
-    class PyrLKOpticalFlow
-    {
-    public:
-        PyrLKOpticalFlow();
-
-        void sparse(const oclMat& prevImg, const oclMat& nextImg, const oclMat& prevPts, oclMat& nextPts,
-            oclMat& status, oclMat* err = 0);
-
-        void dense(const oclMat& prevImg, const oclMat& nextImg, oclMat& u, oclMat& v, oclMat* err = 0);
-
-        Size winSize;
-        int maxLevel;
-        int iters;
-        double derivLambda;
-        bool useInitialFlow;
-        float minEigThreshold;
-        bool getMinEigenVals;
-
-        void releaseMemory();
-    };
-
-The class can calculate an optical flow for a sparse feature set or dense optical flow using the iterative Lucas-Kanade method with pyramids.
-
-.. seealso:: :ocv:func:`calcOpticalFlowPyrLK`
-
-.. note::
-
-   (Ocl) An example the Lucas Kanade optical flow pyramid method can be found at opencv_source_code/samples/ocl/pyrlk_optical_flow.cpp
-   (Ocl) An example for square detection can be found at opencv_source_code/samples/ocl/squares.cpp
-
-ocl::PyrLKOpticalFlow::sparse
------------------------------
-Calculate an optical flow for a sparse feature set.
-
-.. ocv:function:: void ocl::PyrLKOpticalFlow::sparse(const oclMat& prevImg, const oclMat& nextImg, const oclMat& prevPts, oclMat& nextPts, oclMat& status, oclMat* err = 0)
-
-    :param prevImg: First 8-bit input image (supports both grayscale and color images).
-
-    :param nextImg: Second input image of the same size and the same type as  ``prevImg`` .
-
-    :param prevPts: Vector of 2D points for which the flow needs to be found. It must be one row matrix with CV_32FC2 type.
-
-    :param nextPts: Output vector of 2D points (with single-precision floating-point coordinates) containing the calculated new positions of input features in the second image. When ``useInitialFlow`` is true, the vector must have the same size as in the input.
-
-    :param status: Output status vector (CV_8UC1 type). Each element of the vector is set to 1 if the flow for the corresponding features has been found. Otherwise, it is set to 0.
-
-    :param err: Output vector (CV_32FC1 type) that contains the difference between patches around the original and moved points or min eigen value if ``getMinEigenVals`` is checked. It can be NULL, if not needed.
-
-.. seealso:: :ocv:func:`calcOpticalFlowPyrLK`
-
-
-
-ocl::PyrLKOpticalFlow::dense
------------------------------
-Calculate dense optical flow.
-
-.. ocv:function:: void ocl::PyrLKOpticalFlow::dense(const oclMat& prevImg, const oclMat& nextImg, oclMat& u, oclMat& v, oclMat* err = 0)
-
-    :param prevImg: First 8-bit grayscale input image.
-
-    :param nextImg: Second input image of the same size and the same type as  ``prevImg`` .
-
-    :param u: Horizontal component of the optical flow of the same size as input images, 32-bit floating-point, single-channel
-
-    :param v: Vertical component of the optical flow of the same size as input images, 32-bit floating-point, single-channel
-
-    :param err: Output vector (CV_32FC1 type) that contains the difference between patches around the original and moved points or min eigen value if ``getMinEigenVals`` is checked. It can be NULL, if not needed.
-
-
-
-ocl::PyrLKOpticalFlow::releaseMemory
-------------------------------------
-Releases inner buffers memory.
-
-.. ocv:function:: void ocl::PyrLKOpticalFlow::releaseMemory()
-
-
-ocl::interpolateFrames
-----------------------
-Interpolate frames (images) using provided optical flow (displacement field).
-
-.. ocv:function:: void ocl::interpolateFrames(const oclMat& frame0, const oclMat& frame1, const oclMat& fu, const oclMat& fv, const oclMat& bu, const oclMat& bv, float pos, oclMat& newFrame, oclMat& buf)
-
-    :param frame0: First frame (32-bit floating point images, single channel).
-
-    :param frame1: Second frame. Must have the same type and size as ``frame0`` .
-
-    :param fu: Forward horizontal displacement.
-
-    :param fv: Forward vertical displacement.
-
-    :param bu: Backward horizontal displacement.
-
-    :param bv: Backward vertical displacement.
-
-    :param pos: New frame position.
-
-    :param newFrame: Output image.
-
-    :param buf: Temporary buffer, will have width x 6*height size, CV_32FC1 type and contain 6 oclMat: occlusion masks for first frame, occlusion masks for second, interpolated forward horizontal flow, interpolated forward vertical flow, interpolated backward horizontal flow, interpolated backward vertical flow.
-
-
 ocl::HoughCircles
 -----------------
 Finds circles in a grayscale image using the Hough transform.
diff --git a/modules/ocl/doc/images/adaptiveBilateralFilter.jpg b/modules/ocl/doc/images/adaptiveBilateralFilter.jpg
new file mode 100644
index 000000000..6508f693c
Binary files /dev/null and b/modules/ocl/doc/images/adaptiveBilateralFilter.jpg differ
diff --git a/modules/ocl/doc/matrix_reductions.rst b/modules/ocl/doc/matrix_reductions.rst
index aed9fa564..350f86103 100644
--- a/modules/ocl/doc/matrix_reductions.rst
+++ b/modules/ocl/doc/matrix_reductions.rst
@@ -4,7 +4,7 @@ Matrix Reductions
 .. highlight:: cpp
 
 ocl::countNonZero
-------------------
+---------------------
 Returns the number of non-zero elements in src
 
 .. ocv:function:: int ocl::countNonZero(const oclMat &src)
@@ -55,16 +55,26 @@ Returns the sum of matrix elements for each channel
 
 .. ocv:function:: Scalar ocl::sum(const oclMat &m)
 
-    :param m: The Source image of all depth
+    :param m: The Source image of all depth.
 
 Counts the sum of matrix elements for each channel.
 
+ocl::absSum
+---------------
+Returns the sum of absolute values for matrix elements.
+
+.. ocv:function:: Scalar ocl::absSum(const oclMat &m)
+
+    :param m: The Source image of all depth.
+
+Counts the abs sum of matrix elements for each channel.
+
 ocl::sqrSum
 ------------------
 Returns the squared sum of matrix elements for each channel
 
 .. ocv:function:: Scalar ocl::sqrSum(const oclMat &m)
 
-    :param m: The Source image of all depth
+    :param m: The Source image of all depth.
 
 Counts the squared sum of matrix elements for each channel.
diff --git a/modules/ocl/doc/ml_machine_learning.rst b/modules/ocl/doc/ml_machine_learning.rst
new file mode 100644
index 000000000..321cec9db
--- /dev/null
+++ b/modules/ocl/doc/ml_machine_learning.rst
@@ -0,0 +1,88 @@
+ml.Machine Learning
+=============================
+
+.. highlight:: cpp
+
+ocl::KNearestNeighbour
+--------------------------
+.. ocv:class:: ocl::KNearestNeighbour : public ocl::CvKNearest
+
+The class implements K-Nearest Neighbors model as described in the beginning of this section.
+
+ocl::KNearestNeighbour
+--------------------------
+The class declaration is shown below. ::
+
+    class CV_EXPORTS KNearestNeighbour: public CvKNearest
+    {
+    public:
+        KNearestNeighbour();
+        ~KNearestNeighbour();
+
+        bool train(const Mat& trainData, Mat& labels, Mat& sampleIdx = Mat().setTo(Scalar::all(0)),
+            bool isRegression = false, int max_k = 32, bool updateBase = false);
+
+        void clear();
+
+        void find_nearest(const oclMat& samples, int k, oclMat& lables);
+
+    private:
+        /* hidden */
+    };
+
+ocl::KNearestNeighbour::train
+---------------------------------
+Trains the model.
+
+.. ocv:function:: bool ocl::KNearestNeighbour::train(const Mat& trainData, Mat& labels, Mat& sampleIdx = Mat().setTo(Scalar::all(0)), bool isRegression = false, int max_k = 32, bool updateBase = false)
+
+    :param isRegression: Type of the problem: ``true`` for regression and ``false`` for classification.
+
+    :param max_k: Maximum number of neighbors that may be passed to the method :ocv:func:`CvKNearest::find_nearest`.
+
+    :param updateBase: Specifies whether the model is trained from scratch (``updateBase=false``), or it is updated using the new training data (``updateBase=true``). In the latter case, the parameter ``max_k`` must not be larger than the original value.
+
+The method trains the K-Nearest model. It follows the conventions of the generic :ocv:func:`CvStatModel::train` approach with the following limitations:
+
+* Only ``CV_ROW_SAMPLE`` data layout is supported.
+* Input variables are all ordered.
+* Output variables can be either categorical ( ``isRegression=false`` ) or ordered ( ``isRegression=true`` ).
+* Variable subsets (``var_idx``) and missing measurements are not supported.
+
+ocl::KNearestNeighbour::find_nearest
+----------------------------------------
+Finds the neighbors and predicts responses for input vectors.
+
+.. ocv:function:: void ocl::KNearestNeighbour::find_nearest(const oclMat& samples, int k, oclMat& lables )
+
+    :param samples: Input samples stored by rows. It is a single-precision floating-point matrix of :math:`number\_of\_samples \times number\_of\_features` size.
+
+    :param k: Number of used nearest neighbors. It must satisfy constraint: :math:`k \le` :ocv:func:`CvKNearest::get_max_k`.
+
+    :param labels: Vector with results of prediction (regression or classification) for each input sample. It is a single-precision floating-point vector with ``number_of_samples`` elements.
+
+ocl::kmeans
+---------------
+Finds centers of clusters and groups input samples around the clusters.
+
+.. ocv:function:: double ocl::kmeans(const oclMat &src, int K, oclMat &bestLabels, TermCriteria criteria, int attemps, int flags, oclMat &centers)
+
+    :param src: Floating-point matrix of input samples, one row per sample.
+
+    :param K: Number of clusters to split the set by.
+
+    :param bestLabels: Input/output integer array that stores the cluster indices for every sample.
+
+    :param criteria: The algorithm termination criteria, that is, the maximum number of iterations and/or the desired accuracy. The accuracy is specified as ``criteria.epsilon``. As soon as each of the cluster centers moves by less than ``criteria.epsilon`` on some iteration, the algorithm stops.
+
+    :param attempts: Flag to specify the number of times the algorithm is executed using different initial labellings. The algorithm returns the labels that yield the best compactness (see the last function parameter).
+
+    :param flags: Flag that can take the following values:
+
+            * **KMEANS_RANDOM_CENTERS** Select random initial centers in each attempt.
+
+            * **KMEANS_PP_CENTERS** Use ``kmeans++`` center initialization by Arthur and Vassilvitskii [Arthur2007].
+
+            * **KMEANS_USE_INITIAL_LABELS** During the first (and possibly the only) attempt, use the user-supplied labels instead of computing them from the initial centers. For the second and further attempts, use the random or semi-random centers. Use one of  ``KMEANS_*_CENTERS``  flag to specify the exact method.
+
+    :param centers: Output matrix of the cluster centers, one row per each cluster center.
\ No newline at end of file
diff --git a/modules/ocl/doc/object_detection.rst b/modules/ocl/doc/object_detection.rst
index 024ce5268..0539e7738 100644
--- a/modules/ocl/doc/object_detection.rst
+++ b/modules/ocl/doc/object_detection.rst
@@ -4,7 +4,7 @@ Object Detection
 .. highlight:: cpp
 
 ocl::OclCascadeClassifier
--------------------------
+-----------------------------
 .. ocv:class:: ocl::OclCascadeClassifier : public CascadeClassifier
 
 Cascade classifier class used for object detection. Supports HAAR cascade classifier  in the form of cross link ::
@@ -21,20 +21,22 @@ Cascade classifier class used for object detection. Supports HAAR cascade classi
 
    (Ocl) A face detection example using cascade classifiers can be found at opencv_source_code/samples/ocl/facedetect.cpp
 
-ocl::OclCascadeClassifier::oclHaarDetectObjects
+ocl::OclCascadeClassifier::detectMultiScale
 ------------------------------------------------------
 Detects objects of different sizes in the input image.
 
 .. ocv:function:: void ocl::OclCascadeClassifier::detectMultiScale(oclMat &image, std::vector<cv::Rect>& faces, double scaleFactor = 1.1, int minNeighbors = 3, int flags = 0, Size minSize = Size(), Size maxSize = Size())
 
-    :param image:  Matrix of type CV_8U containing an image where objects should be detected.
-
     :param faces: Vector of rectangles where each rectangle contains the detected object.
 
+    :param image:  Matrix of type CV_8U containing an image where objects should be detected.
+
     :param scaleFactor: Parameter specifying how much the image size is reduced at each image scale.
 
     :param minNeighbors: Parameter specifying how many neighbors each candidate rectangle should have to retain it.
 
+    :param flags: Parameter with the same meaning for an old cascade as in the function ``cvHaarDetectObjects``. It is not used for a new cascade.
+
     :param minSize: Minimum possible object size. Objects smaller than that are ignored.
 
     :param maxSize: Maximum possible object size. Objects larger than that are ignored.
@@ -42,7 +44,7 @@ Detects objects of different sizes in the input image.
 The function provides a very similar interface with that in CascadeClassifier class, except using oclMat as input image.
 
 ocl::MatchTemplateBuf
----------------------
+-------------------------
 .. ocv:struct:: ocl::MatchTemplateBuf
 
 Class providing memory buffers for :ocv:func:`ocl::matchTemplate` function, plus it allows to adjust some specific parameters. ::
@@ -59,7 +61,7 @@ Class providing memory buffers for :ocv:func:`ocl::matchTemplate` function, plus
 You can use field `user_block_size` to set specific block size for :ocv:func:`ocl::matchTemplate` function. If you leave its default value `Size(0,0)` then automatic estimation of block size will be used (which is optimized for speed). By varying `user_block_size` you can reduce memory requirements at the cost of speed.
 
 ocl::matchTemplate
-------------------
+----------------------
 Computes a proximity map for a raster template and an image where the template is searched for.
 
 .. ocv:function:: void ocl::matchTemplate(const oclMat& image, const oclMat& templ, oclMat& result, int method)
diff --git a/modules/ocl/doc/ocl.rst b/modules/ocl/doc/ocl.rst
index d6d79e187..76c1f882b 100644
--- a/modules/ocl/doc/ocl.rst
+++ b/modules/ocl/doc/ocl.rst
@@ -12,7 +12,10 @@ ocl. OpenCL-accelerated Computer Vision
     matrix_reductions
     image_filtering
     image_processing
+    ml_machine_learning
     object_detection
     feature_detection_and_description
+    video_analysis
+    camera_calibration_and_3D_reconstruction
 ..    camera_calibration_and_3d_reconstruction
 ..    video
diff --git a/modules/ocl/doc/operations_on_matrices.rst b/modules/ocl/doc/operations_on_matrices.rst
index e47e72092..7eaaf0d81 100644
--- a/modules/ocl/doc/operations_on_matrices.rst
+++ b/modules/ocl/doc/operations_on_matrices.rst
@@ -4,204 +4,224 @@ Operations on Matrics
 .. highlight:: cpp
 
 ocl::oclMat::convertTo
-----------------------
+--------------------------
 Returns void
 
-.. ocv:function:: void ocl::oclMat::convertTo( oclMat &m, int rtype, double alpha = 1, double beta = 0 ) const
+.. ocv:function:: void ocl::oclMat::convertTo(oclMat &m, int rtype, double alpha = 1, double beta = 0) const
 
-    :param m: The destination matrix. If it does not have a proper size or type before the operation, it will be reallocated
+    :param m: the destination matrix. If it does not have a proper size or type before the operation, it will be reallocated.
 
-    :param rtype: The desired destination matrix type, or rather, the depth(since the number of channels will be the same with the source one). If rtype is negative, the destination matrix will have the same type as the source.
+    :param rtype: the desired destination matrix type, or rather, the depth (since the number of channels will be the same with the source one). If rtype is negative, the destination matrix will have the same type as the source.
 
-    :param alpha: must be default now
+    :param alpha: optional scale factor.
 
-    :param beta: must be default now
+    :param beta: optional delta added to the scaled values.
 
-The method converts source pixel values to the target datatype. saturate cast is applied in the end to avoid possible overflows. Supports CV_8UC1, CV_8UC4, CV_32SC1, CV_32SC4, CV_32FC1, CV_32FC4.
+The method converts source pixel values to the target datatype. Saturate cast is applied in the end to avoid possible overflows. Supports all data types.
 
 ocl::oclMat::copyTo
--------------------
+-----------------------
 Returns void
 
-.. ocv:function:: void ocl::oclMat::copyTo( oclMat &m, const oclMat &mask ) const
+.. ocv:function:: void ocl::oclMat::copyTo(oclMat &m, const oclMat &mask = oclMat()) const
 
-    :param m: The destination matrix. If it does not have a proper size or type before the operation, it will be reallocated
+    :param m: The destination matrix. If it does not have a proper size or type before the operation, it will be reallocated.
 
-    :param mask(optional): The operation mask. Its non-zero elements indicate, which matrix elements need to be copied
+    :param mask: The operation mask. Its non-zero elements indicate which matrix elements need to be copied.
 
-Copies the matrix to another one. Supports CV_8UC1, CV_8UC4, CV_32SC1, CV_32SC4, CV_32FC1, CV_32FC4
+Copies the matrix to another one. Supports all data types.
 
 ocl::oclMat::setTo
-------------------
+----------------------
 Returns oclMat
 
 .. ocv:function:: oclMat& ocl::oclMat::setTo(const Scalar &s, const oclMat &mask = oclMat())
 
-    :param s: Assigned scalar, which is converted to the actual array type
+    :param s: Assigned scalar, which is converted to the actual array type.
 
-    :param mask: The operation mask of the same size as ``*this``
+    :param mask: The operation mask of the same size as ``*this`` and type ``CV_8UC1``.
 
-Sets all or some of the array elements to the specified value. This is the advanced variant of Mat::operator=(const Scalar s) operator. Supports CV_8UC1, CV_8UC4, CV_32SC1, CV_32SC4, CV_32FC1, CV_32FC4.
+Sets all or some of the array elements to the specified value. This is the advanced variant of Mat::operator=(const Scalar s) operator. Supports all data types.
 
 ocl::absdiff
 ------------------
 Returns void
 
-.. ocv:function:: void ocl::absdiff( const oclMat& a, const oclMat& b, oclMat& c )
+.. ocv:function:: void ocl::absdiff(const oclMat& src1, const oclMat& src2, oclMat& dst)
 
-.. ocv:function:: void ocl::absdiff( const oclMat& a, const Scalar& s, oclMat& c )
+.. ocv:function:: void ocl::absdiff(const oclMat& src1, const Scalar& s, oclMat& dst)
 
+    :param src1: the first input array.
 
-    :param a: The first input array
+    :param src2: the second input array, must be the same size and same type as ``src1``.
 
-    :param b: The second input array, must be the same size and same type as a
+    :param s: scalar, the second input parameter.
 
-    :param s: Scalar, the second input parameter
+    :param dst: the destination array, it will have the same size and same type as ``src1``.
 
-    :param c: The destination array, it will have the same size and same type as a
-
-Computes per-element absolute difference between two arrays or between array and a scalar. Supports all data types except CV_8S.
+Computes per-element absolute difference between two arrays or between array and a scalar. Supports all data types.
 
 ocl::add
 ------------------
 Returns void
 
-.. ocv:function:: void ocl::add( const oclMat & a, const oclMat & b, oclMat & c )
+.. ocv:function:: void ocl::add(const oclMat & src1, const oclMat & src2, oclMat & dst, const oclMat & mask = oclMat())
 
-.. ocv:function:: void ocl::add( const oclMat & a, const oclMat & b, oclMat & c, const oclMat & mask )
+.. ocv:function:: void ocl::add(const oclMat & src1, const Scalar & s, oclMat & dst, const oclMat & mask = oclMat())
 
-.. ocv:function:: void ocl::add( const oclMat & a, const Scalar & sc, oclMat & c, const oclMat & mask=oclMat() )
+    :param src1: the first input array.
 
-    :param a: The first input array
+    :param src2: the second input array, must be the same size and same type as ``src1``.
 
-    :param b: The second input array, must be the same size and same type as src1
+    :param s: scalar, the second input parameter.
 
-    :param sc: Scalar, the second input parameter
+    :param dst: the destination array, it will have the same size and same type as ``src1``.
 
-    :param c: The destination array, it will have the same size and same type as src1
+    :param mask: the optional operation mask, 8-bit single channel array; specifies elements of the destination array to be changed.
 
-    :param mask: he optional operation mask, 8-bit single channel array; specifies elements of the destination array to be changed
+Computes per-element addition between two arrays or between array and a scalar. Supports all data types.
 
-Computes per-element additon between two arrays or between array and a scalar. Supports all data types except CV_8S.
+ocl::addWeighted
+--------------------
+Computes the weighted sum of two arrays.
+
+.. ocv:function:: void ocl::addWeighted(const oclMat& src1, double  alpha, const oclMat& src2, double beta, double gama, oclMat& dst)
+
+    :param src1: First source array.
+
+    :param alpha: Weight for the first array elements.
+
+    :param src2: Second source array of the same size and channel number as  ``src1`` .
+
+    :param beta: Weight for the second array elements.
+
+    :param dst: Destination array that has the same size and number of channels as the input arrays.
+
+    :param gamma: Scalar added to each sum.
+
+The function ``addWeighted`` calculates the weighted sum of two arrays as follows:
+
+.. math::
+
+    \texttt{dst} (I)= \texttt{saturate} ( \texttt{src1} (I)* \texttt{alpha} +  \texttt{src2} (I)* \texttt{beta} +  \texttt{gamma} )
+
+where ``I`` is a multi-dimensional index of array elements. In case of multi-channel arrays, each channel is processed independently.
+
+.. seealso:: :ocv:func:`addWeighted`
 
 ocl::subtract
 ------------------
 Returns void
 
-.. ocv:function:: void ocl::subtract( const oclMat& a, const oclMat& b, oclMat& c )
+.. ocv:function:: void ocl::subtract(const oclMat& src1, const oclMat& src2, oclMat& dst, const oclMat& mask = oclMat())
 
-.. ocv:function:: void ocl::subtract( const oclMat& a, const oclMat& b, oclMat& c, const oclMat& mask )
+.. ocv:function:: void ocl::subtract(const oclMat& src1, const Scalar& s, oclMat& dst, const oclMat& mask = oclMat())
 
-.. ocv:function:: void ocl::subtract( const oclMat& a, const Scalar& sc, oclMat& c, const oclMat& mask=oclMat() )
+    :param src1: the first input array.
 
-.. ocv:function:: void ocl::subtract( const Scalar& sc, const oclMat& a, oclMat& c, const oclMat& mask=oclMat() )
+    :param src2: the second input array, must be the same size and same type as ``src1``.
 
+    :param s: scalar, the second input parameter.
 
-    :param a: The first input array
+    :param dst: the destination array, it will have the same size and same type as ``src1``.
 
-    :param b: The second input array, must be the same size and same type as src1
+    :param mask: the optional operation mask, 8-bit single channel array; specifies elements of the destination array to be changed.
 
-    :param sc: Scalar, the second input parameter
-
-    :param c: The destination array, it will have the same size and same type as src1
-
-    :param mask: he optional operation mask, 8-bit single channel array; specifies elements of the destination array to be changed
-
-Computes per-element subtract between two arrays or between array and a scalar. Supports all data types except CV_8S.
+Computes per-element subtraction between two arrays or between array and a scalar. Supports all data types.
 
 ocl::multiply
 ------------------
 Returns void
 
-.. ocv:function:: void ocl::multiply( const oclMat& a, const oclMat& b, oclMat& c, double scale=1 )
+.. ocv:function:: void ocl::multiply(const oclMat& src1, const oclMat& src2, oclMat& dst, double scale = 1)
 
-    :param a: The first input array
+    :param src1: the first input array.
 
-    :param b: The second input array, must be the same size and same type as src1
+    :param src2: the second input array, must be the same size and same type as ``src1``.
 
-    :param c: The destination array, it will have the same size and same type as src1
+    :param dst: the destination array, it will have the same size and same type as ``src1``.
 
-    :param scale: must be 1 now
+    :param scale: optional scale factor.
 
-Computes per-element multiply between two arrays or between array and a scalar. Supports all data types except CV_8S.
+Computes per-element multiplication between two arrays or between array and a scalar. Supports all data types.
 
 ocl::divide
 ------------------
 Returns void
 
-.. ocv:function:: void ocl::divide( const oclMat& a, const oclMat& b, oclMat& c, double scale=1 )
+.. ocv:function:: void ocl::divide(const oclMat& src1, const oclMat& src2, oclMat& dst, double scale = 1)
 
-.. ocv:function:: void ocl::divide( double scale, const oclMat& b, oclMat& c )
+.. ocv:function:: void ocl::divide(double scale, const oclMat& src1, oclMat& dst)
 
-    :param a: The first input array
+    :param src1: the first input array.
 
-    :param b: The second input array, must be the same size and same type as src1
+    :param src2: the second input array, must be the same size and same type as ``src1``.
 
-    :param c: The destination array, it will have the same size and same type as src1
+    :param dst: the destination array, it will have the same size and same type as ``src1``.
 
-    :param scale: must be 1 now
+    :param scale: scalar factor.
 
-Computes per-element divide between two arrays or between array and a scalar. Supports all data types except CV_8S.
+Computes per-element division between two arrays or between array and a scalar. Supports all data types.
 
 ocl::bitwise_and
 ------------------
 Returns void
 
-.. ocv:function:: void ocl::bitwise_and( const oclMat& src1, const oclMat& src2, oclMat& dst, const oclMat& mask=oclMat() )
+.. ocv:function:: void ocl::bitwise_and(const oclMat& src1, const oclMat& src2, oclMat& dst, const oclMat& mask = oclMat())
 
-.. ocv:function:: void ocl::bitwise_and( const oclMat& src1, const Scalar& s, oclMat& dst, const oclMat& mask=oclMat() )
+.. ocv:function:: void ocl::bitwise_and(const oclMat& src1, const Scalar& s, oclMat& dst, const oclMat& mask = oclMat())
 
-    :param src1: The first input array
+    :param src1: the first input array.
 
-    :param src2: The second input array, must be the same size and same type as src1
+    :param src2: the second input array, must be the same size and same type as ``src1``.
 
-    :param s: Scalar, the second input parameter
+    :param s: scalar, the second input parameter.
 
-    :param dst: The destination array, it will have the same size and same type as src1
+    :param dst: the destination array, it will have the same size and same type as ``src1``.
 
-    :param mask: The optional operation mask, 8-bit single channel array; specifies elements of the destination array to be changed
+    :param mask: the optional operation mask, 8-bit single channel array; specifies elements of the destination array to be changed.
 
-Computes per-element bitwise_and between two arrays or between array and a scalar. Supports all data types except CV_8S.
+Computes per-element bitwise_and between two arrays or between array and a scalar. Supports all data types.
 
 ocl::bitwise_or
 ------------------
 Returns void
 
-.. ocv:function:: void ocl::bitwise_or( const oclMat& src1, const oclMat& src2, oclMat& dst, const oclMat& mask=oclMat() )
+.. ocv:function:: void ocl::bitwise_or(const oclMat& src1, const oclMat& src2, oclMat& dst, const oclMat& mask = oclMat())
 
-.. ocv:function:: void ocl::bitwise_or( const oclMat& src1, const Scalar& s, oclMat& dst, const oclMat& mask=oclMat() )
+.. ocv:function:: void ocl::bitwise_or(const oclMat& src1, const Scalar& s, oclMat& dst, const oclMat& mask = oclMat())
 
-    :param src1: The first input array
+    :param src1: the first input array.
 
-    :param src2: The second input array, must be the same size and same type as src1
+    :param src2: the second input array, must be the same size and same type as ``src1``.
 
-    :param s: Scalar, the second input parameter
+    :param s: scalar, the second input parameter.
 
-    :param dst: The destination array, it will have the same size and same type as src1
+    :param dst: the destination array, it will have the same size and same type as ``src1``.
 
-    :param mask: The optional operation mask, 8-bit single channel array; specifies elements of the destination array to be changed
+    :param mask: the optional operation mask, 8-bit single channel array; specifies elements of the destination array to be changed.
 
-Computes per-element bitwise_or between two arrays or between array and a scalar. Supports all data types except CV_8S.
+Computes per-element bitwise_or between two arrays or between array and a scalar. Supports all data types.
 
 ocl::bitwise_xor
 ------------------
 Returns void
 
-.. ocv:function:: void ocl::bitwise_xor( const oclMat& src1, const oclMat& src2, oclMat& dst, const oclMat& mask=oclMat() )
+.. ocv:function:: void ocl::bitwise_xor(const oclMat& src1, const oclMat& src2, oclMat& dst, const oclMat& mask = oclMat())
 
-.. ocv:function:: void ocl::bitwise_xor( const oclMat& src1, const Scalar& s, oclMat& dst, const oclMat& mask=oclMat() )
+.. ocv:function:: void ocl::bitwise_xor(const oclMat& src1, const Scalar& s, oclMat& dst, const oclMat& mask = oclMat())
 
-    :param src1: The first input array
+    :param src1: the first input array.
 
-    :param src2: The second input array, must be the same size and same type as src1
+    :param src2: the second input array, must be the same size and same type as ``src1``.
 
-    :param sc: Scalar, the second input parameter
+    :param s: scalar, the second input parameter.
 
-    :param dst: The destination array, it will have the same size and same type as src1
+    :param dst: the destination array, it will have the same size and same type as ``src1``.
 
-    :param mask: The optional operation mask, 8-bit single channel array; specifies elements of the destination array to be changed
+    :param mask: the optional operation mask, 8-bit single channel array; specifies elements of the destination array to be changed.
 
-Computes per-element bitwise_xor between two arrays or between array and a scalar. Supports all data types except CV_8S.
+Computes per-element bitwise_xor between two arrays or between array and a scalar. Supports all data types.
 
 ocl::bitwise_not
 ------------------
@@ -209,11 +229,11 @@ Returns void
 
 .. ocv:function:: void ocl::bitwise_not(const oclMat &src, oclMat &dst)
 
-    :param src: The input array
+    :param src: the input array.
 
-    :param dst: The destination array, it will have the same size and same type as src1
+    :param dst: the destination array, it will have the same size and same type as ``src``.
 
-The functions bitwise not compute per-element bit-wise inversion of the source array:. Supports all data types except CV_8S.
+The function bitwise_not computes per-element bit-wise inversion of the source array. Supports all data types.
 
 ocl::cartToPolar
 ------------------
@@ -221,17 +241,17 @@ Returns void
 
 .. ocv:function:: void ocl::cartToPolar(const oclMat &x, const oclMat &y, oclMat &magnitude, oclMat &angle, bool angleInDegrees = false)
 
-    :param x: The array of x-coordinates; must be single-precision or double-precision floating-point array
+    :param x: the array of x-coordinates; must be single-precision or double-precision floating-point array.
 
-    :param y: The array of y-coordinates; it must have the same size and same type as x
+    :param y: the array of y-coordinates; it must have the same size and same type as ``x``.
 
-    :param magnitude: The destination array of magnitudes of the same size and same type as x
+    :param magnitude: the destination array of magnitudes of the same size and same type as ``x``.
 
-    :param angle: The destination array of angles of the same size and same type as x. The angles are measured in radians (0 to 2pi ) or in degrees (0 to 360 degrees).
+    :param angle: the destination array of angles of the same size and same type as ``x``. The angles are measured in radians (0 to 2pi) or in degrees (0 to 360 degrees).
 
-    :param angleInDegrees: The flag indicating whether the angles are measured in radians, which is default mode, or in degrees
+    :param angleInDegrees: the flag indicating whether the angles are measured in radians, which is default mode, or in degrees.
 
-Calculates the magnitude and angle of 2d vectors. Supports only CV_32F and CV_64F data types.
+Calculates the magnitude and angle of 2D vectors. Supports only ``CV_32F`` and ``CV_64F`` data types.
 
 ocl::polarToCart
 ------------------
@@ -239,57 +259,57 @@ Returns void
 
 .. ocv:function:: void ocl::polarToCart(const oclMat &magnitude, const oclMat &angle, oclMat &x, oclMat &y, bool angleInDegrees = false)
 
-    :param magnitude: The source floating-point array of magnitudes of 2D vectors. It can be an empty matrix (=Mat()) - in this case the function assumes that all the magnitudes are =1. If it's not empty, it must have the same size and same type as angle
+    :param magnitude: the source floating-point array of magnitudes of 2D vectors. It can be an empty matrix (``oclMat()``); in this case the function assumes that all the magnitudes are equal to 1. If it is not empty, it must have the same size and same type as ``angle``.
 
-    :param angle: The source floating-point array of angles of the 2D vectors
+    :param angle: the source floating-point array of angles of the 2D vectors.
 
-    :param x: The destination array of x-coordinates of 2D vectors; will have the same size and the same type as angle
+    :param x: the destination array of x-coordinates of 2D vectors; will have the same size and the same type as ``angle``.
 
-    :param y: The destination array of y-coordinates of 2D vectors; will have the same size and the same type as angle
+    :param y: the destination array of y-coordinates of 2D vectors; will have the same size and the same type as ``angle``.
 
-    :param angleInDegrees: The flag indicating whether the angles are measured in radians, which is default mode, or in degrees
+    :param angleInDegrees: the flag indicating whether the angles are measured in radians, which is default mode, or in degrees.
 
-The function polarToCart computes the cartesian coordinates of each 2D vector represented by the corresponding elements of magnitude and angle. Supports only CV_32F and CV_64F data types.
+The function polarToCart computes the cartesian coordinates of each 2D vector represented by the corresponding elements of magnitude and angle. Supports only ``CV_32F`` and ``CV_64F`` data types.
 
 ocl::compare
 ------------------
 Returns void
 
-.. ocv:function:: void ocl::compare(const oclMat &a, const oclMat &b, oclMat &c, int cmpop)
+.. ocv:function:: void ocl::compare(const oclMat &src1, const oclMat &src2, oclMat &dst, int cmpop)
 
-    :param a: The first source array
+    :param src1: the first source array.
 
-    :param b: The second source array; must have the same size and same type as a
+    :param src2: the second source array; must have the same size and same type as ``src1``.
 
-    :param c: The destination array; will have the same size as a
+    :param dst: the destination array; will have the same size as ``src1`` and type ``CV_8UC1``.
 
-    :param cmpop: The flag specifying the relation between the elements to be checked
+    :param cmpop: the flag specifying the relation between the elements to be checked.
 
-Performs per-element comparison of two arrays or an array and scalar value. Supports all the 1 channel data types except CV_8S.
+Performs per-element comparison of two arrays or an array and scalar value. Supports all data types.
 
 ocl::exp
 ------------------
 Returns void
 
-.. ocv:function:: void ocl::exp(const oclMat &a, oclMat &b)
+.. ocv:function:: void ocl::exp(const oclMat &src, oclMat &dst)
 
-    :param a: The first source array
+    :param src: the source array.
 
-    :param b: The dst array; must have the same size and same type as a
+    :param dst: the destination array; must have the same size and same type as ``src``.
 
-The function exp calculates the exponent of every element of the input array. Supports only CV_32FC1 data type.
+The function exp calculates the exponent of every element of the input array. Supports only ``CV_32FC1`` and ``CV_64F`` data types.
 
 ocl::log
 ------------------
 Returns void
 
-.. ocv:function:: void ocl::log(const oclMat &a, oclMat &b)
+.. ocv:function:: void ocl::log(const oclMat &src, oclMat &dst)
 
-    :param a: The first source array
+    :param src: the source array.
 
-    :param b: The dst array; must have the same size and same type as a
+    :param dst: the destination array; must have the same size and same type as ``src``.
 
-The function log calculates the log of every element of the input array. Supports only CV_32FC1 data type.
+The function log calculates the log of every element of the input array. Supports only ``CV_32FC1`` and ``CV_64F`` data types.
 
 ocl::LUT
 ------------------
@@ -297,13 +317,13 @@ Returns void
 
 .. ocv:function:: void ocl::LUT(const oclMat &src, const oclMat &lut, oclMat &dst)
 
-    :param src: Source array of 8-bit elements
+    :param src: source array of 8-bit elements.
 
-    :param lut: Look-up table of 256 elements. In the case of multi-channel source array, the table should either have a single channel (in this case the same table is used for all channels) or the same number of channels as in the source array
+    :param lut: look-up table of 256 elements. In the case of multi-channel source array, the table should either have a single channel (in this case the same table is used for all channels) or the same number of channels as in the source array.
 
-    :param dst: Destination array; will have the same size and the same number of channels as src, and the same depth as lut
+    :param dst: destination array; will have the same size and the same number of channels as ``src``, and the same depth as ``lut``.
 
-Performs a look-up table transform of an array. Supports only CV_8UC1 and CV_8UC4 data type.
+Performs a look-up table transform of an array.
 
 ocl::magnitude
 ------------------
@@ -311,25 +331,25 @@ Returns void
 
 .. ocv:function:: void ocl::magnitude(const oclMat &x, const oclMat &y, oclMat &magnitude)
 
-    :param x: The floating-point array of x-coordinates of the vectors
+    :param x: the floating-point array of x-coordinates of the vectors.
 
-    :param y: he floating-point array of y-coordinates of the vectors; must have the same size as x
+    :param y: the floating-point array of y-coordinates of the vectors; must have the same size as ``x``.
 
-    :param magnitude: The destination array; will have the same size and same type as x
+    :param magnitude: the destination array; will have the same size and same type as ``x``.
 
-The function magnitude calculates magnitude of 2D vectors formed from the corresponding elements of x and y arrays. Supports only CV_32F and CV_64F data type.
+The function magnitude calculates magnitude of 2D vectors formed from the corresponding elements of ``x`` and ``y`` arrays. Supports only ``CV_32F`` and ``CV_64F`` data types.
 
 ocl::flip
 ------------------
 Returns void
 
-.. ocv:function:: void ocl::flip( const oclMat& a, oclMat& b, int flipCode )
+.. ocv:function:: void ocl::flip(const oclMat& src, oclMat& dst, int flipCode)
 
-    :param a: Source image.
+    :param src: source image.
 
-    :param b: Destination image
+    :param dst: destination image.
 
-    :param flipCode: Specifies how to flip the array: 0 means flipping around the x-axis, positive (e.g., 1) means flipping around y-axis, and negative (e.g., -1) means flipping around both axes.
+    :param flipCode: specifies how to flip the array: 0 means flipping around the x-axis, positive (e.g., 1) means flipping around y-axis, and negative (e.g., -1) means flipping around both axes.
 
 The function flip flips the array in one of three different ways (row and column indices are 0-based). Supports all data types.
 
@@ -339,13 +359,13 @@ Returns void
 
 .. ocv:function:: void ocl::meanStdDev(const oclMat &mtx, Scalar &mean, Scalar &stddev)
 
-    :param mtx: Source image.
+    :param mtx: source image.
 
-    :param mean: The output parameter: computed mean value
+    :param mean: the output parameter: computed mean value.
 
-    :param stddev: The output parameter: computed standard deviation
+    :param stddev: the output parameter: computed standard deviation.
 
-The functions meanStdDev compute the mean and the standard deviation M of array elements, independently for each channel, and return it via the output parameters. Supports all data types except CV_32F,CV_64F
+The function meanStdDev computes the mean and the standard deviation of array elements, independently for each channel, and returns them via the output parameters. Supports all data types except ``CV_32F``, ``CV_64F``.
 
 ocl::merge
 ------------------
@@ -353,9 +373,9 @@ Returns void
 
 .. ocv:function:: void ocl::merge(const vector<oclMat> &src, oclMat &dst)
 
-    :param src: The source array or vector of the single-channel matrices to be merged. All the matrices in src must have the same size and the same type
+    :param src: The source array or vector of the single-channel matrices to be merged. All the matrices in src must have the same size and the same type.
 
-    :param dst: The destination array; will have the same size and the same depth as src, the number of channels will match the number of source matrices
+    :param dst: The destination array; will have the same size and the same depth as src, the number of channels will match the number of source matrices.
 
 Composes a multi-channel array from several single-channel arrays. Supports all data types.
 
@@ -379,13 +399,13 @@ Returns the calculated norm
 
 .. ocv:function:: double ocl::norm(const oclMat &src1, const oclMat &src2, int normType = NORM_L2)
 
-    :param src1: The first source array
+    :param src1: the first source array.
 
-    :param src2: The second source array of the same size and the same type as src1
+    :param src2: the second source array of the same size and the same type as ``src1``.
 
-    :param normType: Type of the norm
+    :param normType: type of the norm.
 
-Calculates absolute array norm, absolute difference norm, or relative difference norm. Supports only CV_8UC1 data type.
+Calculates absolute array norm, absolute difference norm, or relative difference norm. Supports only ``CV_8UC1`` data type.
 
 ocl::phase
 ------------------
@@ -393,15 +413,15 @@ Returns void
 
 .. ocv:function:: void ocl::phase(const oclMat &x, const oclMat &y, oclMat &angle, bool angleInDegrees = false)
 
-    :param x: The source floating-point array of x-coordinates of 2D vectors
+    :param x: the source floating-point array of x-coordinates of 2D vectors.
 
-    :param y: The source array of y-coordinates of 2D vectors; must have the same size and the same type as x
+    :param y: the source array of y-coordinates of 2D vectors; must have the same size and the same type as ``x``.
 
-    :param angle: The destination array of vector angles; it will have the same size and same type as x
+    :param angle: the destination array of vector angles; it will have the same size and same type as ``x``.
 
-    :param angleInDegrees: When it is true, the function will compute angle in degrees, otherwise they will be measured in radians
+    :param angleInDegrees: when it is true, the function will compute angle in degrees, otherwise they will be measured in radians.
 
-The function phase computes the rotation angle of each 2D vector that is formed from the corresponding elements of x and y. Supports only CV_32FC1 and CV_64FC1 data type.
+The function phase computes the rotation angle of each 2D vector that is formed from the corresponding elements of ``x`` and ``y``. Supports only ``CV_32FC1`` and ``CV_64FC1`` data types.
 
 ocl::pow
 ------------------
@@ -409,13 +429,13 @@ Returns void
 
 .. ocv:function:: void ocl::pow(const oclMat &x, double p, oclMat &y)
 
-    :param x: The source array
+    :param x: the source array.
 
-    :param power: The exponent of power;The source floating-point array of angles of the 2D vectors
+    :param p: the exponent of power.
 
-    :param y: The destination array, should be the same type as the source
+    :param y: the destination array, should be the same type as the source.
 
-The function pow raises every element of the input array to p. Supports only CV_32FC1 and CV_64FC1 data type.
+The function pow raises every element of the input array to ``p``. Supports only ``CV_32FC1`` and ``CV_64FC1`` data types.
 
 ocl::transpose
 ------------------
@@ -423,26 +443,26 @@ Returns void
 
 .. ocv:function:: void ocl::transpose(const oclMat &src, oclMat &dst)
 
-    :param src: The source array
+    :param src: the source array.
 
-    :param dst: The destination array of the same type as src
+    :param dst: the destination array of the same type as ``src``.
 
-Transposes a matrix. Supports 8UC1, 8UC4, 8SC4, 16UC2, 16SC2, 32SC1 and 32FC1 data types.
+Transposes a matrix. When ``src`` and ``dst`` are the same matrix and the matrix is square, the operation is performed in place.
 
 
 ocl::dft
 ------------
 Performs a forward or inverse discrete Fourier transform (1D or 2D) of the floating point matrix.
 
-.. ocv:function:: void ocl::dft( const oclMat& src, oclMat& dst, Size dft_size=Size(0, 0), int flags=0 )
+.. ocv:function:: void ocl::dft(const oclMat& src, oclMat& dst, Size dft_size = Size(), int flags = 0)
 
-    :param src: Source matrix (real or complex).
+    :param src: source matrix (real or complex).
 
-    :param dst: Destination matrix (real or complex).
+    :param dst: destination matrix (real or complex).
 
-    :param dft_size: Size of original input, which is used for transformation from complex to real.
+    :param dft_size: size of original input, which is used for transformation from complex to real.
 
-    :param flags: Optional flags:
+    :param flags: optional flags:
 
         * **DFT_ROWS** transforms each individual row of the source matrix.
 
@@ -452,9 +472,9 @@ Performs a forward or inverse discrete Fourier transform (1D or 2D) of the float
 
         * **DFT_REAL_OUTPUT** specifies the output as real. The source matrix is the result of real-complex transform, so the destination matrix must be real.
 
-Use to handle real matrices ( ``CV32FC1`` ) and complex matrices in the interleaved format ( ``CV32FC2`` ).
+Use to handle real matrices (``CV_32FC1``) and complex matrices in the interleaved format (``CV_32FC2``).
 
-The dft_size must be powers of 2, 3 and 5. Real to complex dft output is not the same with cpu version. real to complex and complex to real does not support DFT_ROWS
+The ``dft_size`` must be powers of ``2``, ``3`` and ``5``. Real-to-complex DFT output is not the same as in the CPU version. Real-to-complex and complex-to-real transforms do not support ``DFT_ROWS``.
 
 .. seealso:: :ocv:func:`dft`
 
@@ -464,22 +484,22 @@ Performs generalized matrix multiplication.
 
 .. ocv:function:: void ocl::gemm(const oclMat& src1, const oclMat& src2, double alpha, const oclMat& src3, double beta, oclMat& dst, int flags = 0)
 
-    :param src1: First multiplied input matrix that should be ``CV_32FC1`` type.
+    :param src1: first multiplied input matrix that should be ``CV_32FC1`` type.
 
-    :param src2: Second multiplied input matrix of the same type as  ``src1`` .
+    :param src2: second multiplied input matrix of the same type as ``src1``.
 
-    :param alpha: Weight of the matrix product.
+    :param alpha: weight of the matrix product.
 
-    :param src3: Third optional delta matrix added to the matrix product. It should have the same type as  ``src1``  and  ``src2`` .
+    :param src3: third optional delta matrix added to the matrix product. It should have the same type as ``src1`` and ``src2``.
 
-    :param beta: Weight of  ``src3`` .
+    :param beta: weight of ``src3``.
 
-    :param dst: Destination matrix. It has the proper size and the same type as input matrices.
+    :param dst: destination matrix. It has the proper size and the same type as input matrices.
 
-    :param flags: Operation flags:
+    :param flags: operation flags:
 
-            * **GEMM_1_T** transpose  ``src1``
-            * **GEMM_2_T** transpose  ``src2``
+            * **GEMM_1_T** transpose ``src1``.
+            * **GEMM_2_T** transpose ``src2``.
 
 .. seealso:: :ocv:func:`gemm`
 
@@ -489,28 +509,29 @@ Returns void
 
 .. ocv:function:: void ocl::sortByKey(oclMat& keys, oclMat& values, int method, bool isGreaterThan = false)
 
-    :param keys:   The keys to be used as sorting indices.
+    :param keys: the keys to be used as sorting indices.
 
-    :param values: The array of values.
+    :param values: the array of values.
 
-    :param isGreaterThan: Determine sorting order.
+    :param isGreaterThan: determine sorting order.
 
     :param method: supported sorting methods:
-            * **SORT_BITONIC**   bitonic sort, only support power-of-2 buffer size
-            * **SORT_SELECTION** selection sort, currently cannot sort duplicate keys
-            * **SORT_MERGE**     merge sort
-            * **SORT_RADIX**     radix sort, only support signed int/float keys(``CV_32S``/``CV_32F``)
+
+            * **SORT_BITONIC**   bitonic sort, only supports power-of-2 buffer size.
+            * **SORT_SELECTION** selection sort, currently cannot sort duplicate keys.
+            * **SORT_MERGE**     merge sort.
+            * **SORT_RADIX**     radix sort, only supports signed int/float keys (``CV_32S``/``CV_32F``).
 
 Returns the sorted result of all the elements in values based on equivalent keys.
 
-The element unit in the values to be sorted is determined from the data type,
-i.e., a ``CV_32FC2`` input ``{a1a2, b1b2}`` will be considered as two elements, regardless its matrix dimension.
+The element unit in the values to be sorted is determined from the data type, i.e., a ``CV_32FC2`` input ``{a1a2, b1b2}`` will be considered as two elements, regardless of its matrix dimension.
 
 Both keys and values will be sorted inplace.
 
-Keys needs to be a **single** channel `oclMat`.
+Keys need to be a **single** channel ``oclMat``.
 
 Example::
+
     input -
     keys   = {2,    3,   1}   (CV_8UC1)
     values = {10,5, 4,3, 6,2} (CV_8UC2)
diff --git a/modules/ocl/doc/structures_and_utility_functions.rst b/modules/ocl/doc/structures_and_utility_functions.rst
index 3810d7e2d..c3c93ecbf 100644
--- a/modules/ocl/doc/structures_and_utility_functions.rst
+++ b/modules/ocl/doc/structures_and_utility_functions.rst
@@ -4,7 +4,7 @@ Data Structures and Utility Functions
 .. highlight:: cpp
 
 ocl::Info
----------
+-------------
 .. ocv:class:: ocl::Info
 
 this class should be maintained by the user and be passed to getDevice
@@ -42,7 +42,7 @@ Returns void
 If you call this function and set a valid path, the OCL module will save the compiled kernel to the address in the first time and reload the binary since that. It can save compilation time at the runtime.
 
 ocl::getoclContext
-------------------
+----------------------
 Returns the pointer to the opencl context
 
 .. ocv:function:: void* ocl::getoclContext()
diff --git a/modules/ocl/doc/video_analysis.rst b/modules/ocl/doc/video_analysis.rst
new file mode 100644
index 000000000..599c0f9b6
--- /dev/null
+++ b/modules/ocl/doc/video_analysis.rst
@@ -0,0 +1,570 @@
+Video Analysis
+=============================
+
+.. highlight:: cpp
+
+ocl::GoodFeaturesToTrackDetector_OCL
+----------------------------------------
+.. ocv:class:: ocl::GoodFeaturesToTrackDetector_OCL
+
+Class used for strong corners detection on an image. ::
+
+    class GoodFeaturesToTrackDetector_OCL
+    {
+    public:
+        explicit GoodFeaturesToTrackDetector_OCL(int maxCorners = 1000, double qualityLevel = 0.01, double minDistance = 0.0,
+            int blockSize = 3, bool useHarrisDetector = false, double harrisK = 0.04);
+
+        //! return 1 rows matrix with CV_32FC2 type
+        void operator ()(const oclMat& image, oclMat& corners, const oclMat& mask = oclMat());
+        //! download points of type Point2f to a vector. the vector's content will be erased
+        void downloadPoints(const oclMat &points, std::vector<Point2f> &points_v);
+
+        int maxCorners;
+        double qualityLevel;
+        double minDistance;
+
+        int blockSize;
+        bool useHarrisDetector;
+        double harrisK;
+        void releaseMemory()
+        {
+            Dx_.release();
+            Dy_.release();
+            eig_.release();
+            minMaxbuf_.release();
+            tmpCorners_.release();
+        }
+    };
+
+The class finds the most prominent corners in the image.
+
+.. seealso:: :ocv:func:`goodFeaturesToTrack()`
+
+ocl::GoodFeaturesToTrackDetector_OCL::GoodFeaturesToTrackDetector_OCL
+-------------------------------------------------------------------------
+Constructor.
+
+.. ocv:function:: ocl::GoodFeaturesToTrackDetector_OCL::GoodFeaturesToTrackDetector_OCL(int maxCorners = 1000, double qualityLevel = 0.01, double minDistance = 0.0, int blockSize = 3, bool useHarrisDetector = false, double harrisK = 0.04)
+
+    :param maxCorners: Maximum number of corners to return. If more corners are found than this maximum, the strongest of them are returned.
+
+    :param qualityLevel: Parameter characterizing the minimal accepted quality of image corners. The parameter value is multiplied by the best corner quality measure, which is the minimal eigenvalue (see  :ocv:func:`ocl::cornerMinEigenVal` ) or the Harris function response (see  :ocv:func:`ocl::cornerHarris` ). The corners with the quality measure less than the product are rejected. For example, if the best corner has the quality measure = 1500, and the  ``qualityLevel=0.01`` , then all the corners with the quality measure less than 15 are rejected.
+
+    :param minDistance: Minimum possible Euclidean distance between the returned corners.
+
+    :param blockSize: Size of an average block for computing a derivative covariation matrix over each pixel neighborhood. See  :ocv:func:`cornerEigenValsAndVecs` .
+
+    :param useHarrisDetector: Parameter indicating whether to use a Harris detector (see :ocv:func:`ocl::cornerHarris`) or :ocv:func:`ocl::cornerMinEigenVal`.
+
+    :param harrisK: Free parameter of the Harris detector.
+
+ocl::GoodFeaturesToTrackDetector_OCL::operator ()
+-------------------------------------------------------
+Finds the most prominent corners in the image.
+
+.. ocv:function:: void ocl::GoodFeaturesToTrackDetector_OCL::operator ()(const oclMat& image, oclMat& corners, const oclMat& mask = oclMat())
+
+    :param image: Input 8-bit, single-channel image.
+
+    :param corners: Output vector of detected corners (it will be one row matrix with CV_32FC2 type).
+
+    :param mask: Optional region of interest. If the mask is not empty (it needs to have the type ``CV_8UC1`` and the same size as ``image``), it specifies the region in which the corners are detected.
+
+.. seealso:: :ocv:func:`goodFeaturesToTrack`
+
+ocl::GoodFeaturesToTrackDetector_OCL::releaseMemory
+--------------------------------------------------------
+Releases inner buffers memory.
+
+.. ocv:function:: void ocl::GoodFeaturesToTrackDetector_OCL::releaseMemory()
+
+ocl::FarnebackOpticalFlow
+-------------------------------
+.. ocv:class:: ocl::FarnebackOpticalFlow
+
+Class computing a dense optical flow using Gunnar Farneback's algorithm. ::
+
+    class CV_EXPORTS FarnebackOpticalFlow
+    {
+    public:
+        FarnebackOpticalFlow();
+
+        int numLevels;
+        double pyrScale;
+        bool fastPyramids;
+        int winSize;
+        int numIters;
+        int polyN;
+        double polySigma;
+        int flags;
+
+        void operator ()(const oclMat &frame0, const oclMat &frame1, oclMat &flowx, oclMat &flowy);
+
+        void releaseMemory();
+
+    private:
+        /* hidden */
+    };
+
+ocl::FarnebackOpticalFlow::operator ()
+------------------------------------------
+Computes a dense optical flow using Gunnar Farneback's algorithm.
+
+.. ocv:function:: void ocl::FarnebackOpticalFlow::operator ()(const oclMat &frame0, const oclMat &frame1, oclMat &flowx, oclMat &flowy)
+
+    :param frame0: First 8-bit gray-scale input image
+    :param frame1: Second 8-bit gray-scale input image
+    :param flowx: Flow horizontal component
+    :param flowy: Flow vertical component
+
+
+.. seealso:: :ocv:func:`calcOpticalFlowFarneback`
+
+ocl::FarnebackOpticalFlow::releaseMemory
+--------------------------------------------
+Releases unused auxiliary memory buffers.
+
+.. ocv:function:: void ocl::FarnebackOpticalFlow::releaseMemory()
+
+
+ocl::PyrLKOpticalFlow
+-------------------------
+.. ocv:class:: ocl::PyrLKOpticalFlow
+
+Class used for calculating an optical flow. ::
+
+    class PyrLKOpticalFlow
+    {
+    public:
+        PyrLKOpticalFlow();
+
+        void sparse(const oclMat& prevImg, const oclMat& nextImg, const oclMat& prevPts, oclMat& nextPts,
+            oclMat& status, oclMat* err = 0);
+
+        void dense(const oclMat& prevImg, const oclMat& nextImg, oclMat& u, oclMat& v, oclMat* err = 0);
+
+        Size winSize;
+        int maxLevel;
+        int iters;
+        double derivLambda;
+        bool useInitialFlow;
+        float minEigThreshold;
+        bool getMinEigenVals;
+
+        void releaseMemory();
+
+    private:
+        /* hidden */
+    };
+
+The class can calculate an optical flow for a sparse feature set or dense optical flow using the iterative Lucas-Kanade method with pyramids.
+
+.. seealso:: :ocv:func:`calcOpticalFlowPyrLK`
+
+ocl::PyrLKOpticalFlow::sparse
+---------------------------------
+Calculates an optical flow for a sparse feature set.
+
+.. ocv:function:: void ocl::PyrLKOpticalFlow::sparse(const oclMat& prevImg, const oclMat& nextImg, const oclMat& prevPts, oclMat& nextPts, oclMat& status, oclMat* err = 0)
+
+    :param prevImg: First 8-bit input image (supports both grayscale and color images).
+
+    :param nextImg: Second input image of the same size and the same type as  ``prevImg`` .
+
+    :param prevPts: Vector of 2D points for which the flow needs to be found. It must be one row matrix with CV_32FC2 type.
+
+    :param nextPts: Output vector of 2D points (with single-precision floating-point coordinates) containing the calculated new positions of input features in the second image. When ``useInitialFlow`` is true, the vector must have the same size as in the input.
+
+    :param status: Output status vector (CV_8UC1 type). Each element of the vector is set to 1 if the flow for the corresponding features has been found. Otherwise, it is set to 0.
+
+    :param err: Output vector (CV_32FC1 type) that contains the difference between patches around the original and moved points or min eigen value if ``getMinEigenVals`` is checked. It can be NULL, if not needed.
+
+.. seealso:: :ocv:func:`calcOpticalFlowPyrLK`
+
+
+ocl::PyrLKOpticalFlow::dense
+---------------------------------
+Calculate dense optical flow.
+
+.. ocv:function:: void ocl::PyrLKOpticalFlow::dense(const oclMat& prevImg, const oclMat& nextImg, oclMat& u, oclMat& v, oclMat* err = 0)
+
+    :param prevImg: First 8-bit grayscale input image.
+
+    :param nextImg: Second input image of the same size and the same type as  ``prevImg`` .
+
+    :param u: Horizontal component of the optical flow of the same size as input images, 32-bit floating-point, single-channel
+
+    :param v: Vertical component of the optical flow of the same size as input images, 32-bit floating-point, single-channel
+
+    :param err: Output vector (CV_32FC1 type) that contains the difference between patches around the original and moved points or min eigen value if ``getMinEigenVals`` is checked. It can be NULL, if not needed.
+
+
+ocl::PyrLKOpticalFlow::releaseMemory
+----------------------------------------
+Releases inner buffers memory.
+
+.. ocv:function:: void ocl::PyrLKOpticalFlow::releaseMemory()
+
+ocl::interpolateFrames
+--------------------------
+Interpolates frames (images) using provided optical flow (displacement field).
+
+.. ocv:function:: void ocl::interpolateFrames(const oclMat& frame0, const oclMat& frame1, const oclMat& fu, const oclMat& fv, const oclMat& bu, const oclMat& bv, float pos, oclMat& newFrame, oclMat& buf)
+
+    :param frame0: First frame (32-bit floating point images, single channel).
+
+    :param frame1: Second frame. Must have the same type and size as ``frame0`` .
+
+    :param fu: Forward horizontal displacement.
+
+    :param fv: Forward vertical displacement.
+
+    :param bu: Backward horizontal displacement.
+
+    :param bv: Backward vertical displacement.
+
+    :param pos: New frame position.
+
+    :param newFrame: Output image.
+
+    :param buf: Temporary buffer, will have width x 6*height size, CV_32FC1 type and contain 6 oclMat: occlusion masks for first frame, occlusion masks for second, interpolated forward horizontal flow, interpolated forward vertical flow, interpolated backward horizontal flow, interpolated backward vertical flow.
+
+    :param stream: Stream for the asynchronous version.
+
+ocl::KalmanFilter
+--------------------
+.. ocv:class:: ocl::KalmanFilter
+
+Kalman filter class. ::
+
+    class CV_EXPORTS KalmanFilter
+    {
+    public:
+        KalmanFilter();
+        //! the full constructor taking the dimensionality of the state, of the measurement and of the control vector
+        KalmanFilter(int dynamParams, int measureParams, int controlParams=0, int type=CV_32F);
+        //! re-initializes Kalman filter. The previous content is destroyed.
+        void init(int dynamParams, int measureParams, int controlParams=0, int type=CV_32F);
+
+        const oclMat& predict(const oclMat& control=oclMat());
+        const oclMat& correct(const oclMat& measurement);
+
+        oclMat statePre; //!< predicted state (x'(k)): x(k)=A*x(k-1)+B*u(k)
+        oclMat statePost; //!< corrected state (x(k)): x(k)=x'(k)+K(k)*(z(k)-H*x'(k))
+        oclMat transitionMatrix; //!< state transition matrix (A)
+        oclMat controlMatrix; //!< control matrix (B) (not used if there is no control)
+        oclMat measurementMatrix; //!< measurement matrix (H)
+        oclMat processNoiseCov; //!< process noise covariance matrix (Q)
+        oclMat measurementNoiseCov;//!< measurement noise covariance matrix (R)
+        oclMat errorCovPre; //!< priori error estimate covariance matrix (P'(k)): P'(k)=A*P(k-1)*At + Q
+        oclMat gain; //!< Kalman gain matrix (K(k)): K(k)=P'(k)*Ht*inv(H*P'(k)*Ht+R)
+        oclMat errorCovPost; //!< posteriori error estimate covariance matrix (P(k)): P(k)=(I-K(k)*H)*P'(k)
+    private:
+        /* hidden */
+    };
+
+ocl::KalmanFilter::KalmanFilter
+----------------------------------
+The constructors.
+
+.. ocv:function:: ocl::KalmanFilter::KalmanFilter()
+
+.. ocv:function:: ocl::KalmanFilter::KalmanFilter(int dynamParams, int measureParams, int controlParams=0, int type=CV_32F)
+
+    The full constructor.
+
+    :param dynamParams: Dimensionality of the state.
+
+    :param measureParams: Dimensionality of the measurement.
+
+    :param controlParams: Dimensionality of the control vector.
+
+    :param type: Type of the created matrices that should be ``CV_32F`` or ``CV_64F``.
+
+
+ocl::KalmanFilter::init
+---------------------------
+Re-initializes Kalman filter. The previous content is destroyed.
+
+.. ocv:function:: void ocl::KalmanFilter::init(int dynamParams, int measureParams, int controlParams=0, int type=CV_32F)
+
+    :param dynamParams: Dimensionality of the state.
+
+    :param measureParams: Dimensionality of the measurement.
+
+    :param controlParams: Dimensionality of the control vector.
+
+    :param type: Type of the created matrices that should be ``CV_32F`` or ``CV_64F``.
+
+
+ocl::KalmanFilter::predict
+------------------------------
+Computes a predicted state.
+
+.. ocv:function:: const oclMat& ocl::KalmanFilter::predict(const oclMat& control=oclMat())
+
+    :param control: The optional input control
+
+
+ocl::KalmanFilter::correct
+-----------------------------
+Updates the predicted state from the measurement.
+
+.. ocv:function:: const oclMat& ocl::KalmanFilter::correct(const oclMat& measurement)
+
+    :param measurement: The measured system parameters
+
+
+ocl::BackgroundSubtractor
+----------------------------
+.. ocv:class:: ocl::BackgroundSubtractor
+
+Base class for background/foreground segmentation. ::
+
+    class CV_EXPORTS BackgroundSubtractor
+    {
+    public:
+        //! the virtual destructor
+        virtual ~BackgroundSubtractor();
+        //! the update operator that takes the next video frame and returns the current foreground mask as 8-bit binary image.
+        virtual void operator()(const oclMat& image, oclMat& fgmask, float learningRate);
+
+        //! computes a background image
+        virtual void getBackgroundImage(oclMat& backgroundImage) const = 0;
+    };
+
+
+The class is only used to define the common interface for the whole family of background/foreground segmentation algorithms.
+
+
+ocl::BackgroundSubtractor::operator()
+-----------------------------------------
+Computes a foreground mask.
+
+.. ocv:function:: void ocl::BackgroundSubtractor::operator()(const oclMat& image, oclMat& fgmask, float learningRate)
+
+    :param image: Next video frame.
+
+    :param fgmask: The output foreground mask as an 8-bit binary image.
+
+
+ocl::BackgroundSubtractor::getBackgroundImage
+-------------------------------------------------
+Computes a background image.
+
+.. ocv:function:: void ocl::BackgroundSubtractor::getBackgroundImage(oclMat& backgroundImage) const
+
+    :param backgroundImage: The output background image.
+
+.. note:: Sometimes the background image can be very blurry, as it contains the average background statistics.
+
+ocl::MOG
+------------
+.. ocv:class:: ocl::MOG : public ocl::BackgroundSubtractor
+
+Gaussian Mixture-based Background/Foreground Segmentation Algorithm. ::
+
+    class CV_EXPORTS MOG: public cv::ocl::BackgroundSubtractor
+    {
+    public:
+        //! the default constructor
+        MOG(int nmixtures = -1);
+
+        //! re-initialization method
+        void initialize(Size frameSize, int frameType);
+
+        //! the update operator
+        void operator()(const oclMat& frame, oclMat& fgmask, float learningRate = 0.f);
+
+        //! computes a background image which is the mean of all background gaussians
+        void getBackgroundImage(oclMat& backgroundImage) const;
+
+        //! releases all inner buffers
+        void release();
+
+        int history;
+        float varThreshold;
+        float backgroundRatio;
+        float noiseSigma;
+
+    private:
+        /* hidden */
+    };
+
+The class discriminates between foreground and background pixels by building and maintaining a model of the background. Any pixel which does not fit this model is then deemed to be foreground. The class implements algorithm described in [MOG2001]_.
+
+.. seealso:: :ocv:class:`BackgroundSubtractorMOG`
+
+
+ocl::MOG::MOG
+---------------------
+The constructor.
+
+.. ocv:function:: ocl::MOG::MOG(int nmixtures = -1)
+
+    :param nmixtures: Number of Gaussian mixtures.
+
+Default constructor sets all parameters to default values.
+
+
+ocl::MOG::operator()
+------------------------
+Updates the background model and returns the foreground mask.
+
+.. ocv:function:: void ocl::MOG::operator()(const oclMat& frame, oclMat& fgmask, float learningRate = 0.f)
+
+    :param frame: Next video frame.
+
+    :param fgmask: The output foreground mask as an 8-bit binary image.
+
+    :param learningRate: Learning rate used when updating the background model.
+
+
+ocl::MOG::getBackgroundImage
+--------------------------------
+Computes a background image.
+
+.. ocv:function:: void ocl::MOG::getBackgroundImage(oclMat& backgroundImage) const
+
+    :param backgroundImage: The output background image.
+
+    :param stream: Stream for the asynchronous version.
+
+
+ocl::MOG::release
+---------------------
+Releases the memory of all inner buffers.
+
+.. ocv:function:: void ocl::MOG::release()
+
+
+ocl::MOG2
+-------------
+.. ocv:class:: ocl::MOG2 : public ocl::BackgroundSubtractor
+
+Gaussian Mixture-based Background/Foreground Segmentation Algorithm. ::
+
+    class CV_EXPORTS MOG2: public cv::ocl::BackgroundSubtractor
+    {
+    public:
+        //! the default constructor
+        MOG2(int nmixtures = -1);
+
+        //! re-initialization method
+        void initialize(Size frameSize, int frameType);
+
+        //! the update operator
+        void operator()(const oclMat& frame, oclMat& fgmask, float learningRate = -1.0f);
+
+        //! computes a background image which is the mean of all background gaussians
+        void getBackgroundImage(oclMat& backgroundImage) const;
+
+        //! releases all inner buffers
+        void release();
+
+        int history;
+
+        float varThreshold;
+
+        float backgroundRatio;
+
+        float varThresholdGen;
+
+        float fVarInit;
+        float fVarMin;
+        float fVarMax;
+
+        float fCT;
+
+        bool bShadowDetection;
+        unsigned char nShadowDetection;
+        float fTau;
+
+    private:
+        /* hidden */
+    };
+
+  The class discriminates between foreground and background pixels by building and maintaining a model of the background. Any pixel which does not fit this model is then deemed to be foreground. The class implements algorithm described in [MOG2004]_.
+
+  Here are important members of the class that control the algorithm, which you can set after constructing the class instance:
+
+    .. ocv:member:: float backgroundRatio
+
+        Threshold defining whether the component is significant enough to be included into the background model. ``cf=0.1 => TB=0.9`` is default. For ``alpha=0.001``, it means that the mode should exist for approximately 105 frames before it is considered foreground.
+
+    .. ocv:member:: float varThreshold
+
+        Threshold for the squared Mahalanobis distance that helps decide when a sample is close to the existing components (corresponds to ``Tg``). If it is not close to any component, a new component is generated. ``3 sigma => Tg=3*3=9`` is default. A smaller ``Tg`` value generates more components. A higher ``Tg`` value may result in a small number of components but they can grow too large.
+
+    .. ocv:member:: float fVarInit
+
+        Initial variance for the newly generated components. It affects the speed of adaptation. The parameter value is based on your estimate of the typical standard deviation from the images. OpenCV uses 15 as a reasonable value.
+
+    .. ocv:member:: float fVarMin
+
+        Parameter used to further control the variance.
+
+    .. ocv:member:: float fVarMax
+
+        Parameter used to further control the variance.
+
+    .. ocv:member:: float fCT
+
+        Complexity reduction parameter. This parameter defines the number of samples needed to accept to prove the component exists. ``CT=0.05`` is a default value for all the samples. By setting ``CT=0`` you get an algorithm very similar to the standard Stauffer&Grimson algorithm.
+
+    .. ocv:member:: uchar nShadowDetection
+
+        The value for marking shadow pixels in the output foreground mask. Default value is 127.
+
+    .. ocv:member:: float fTau
+
+        Shadow threshold. The shadow is detected if the pixel is a darker version of the background. ``Tau`` is a threshold defining how much darker the shadow can be. ``Tau= 0.5`` means that if a pixel is more than twice darker then it is not shadow. See [ShadowDetect2003]_.
+
+    .. ocv:member:: bool bShadowDetection
+
+        Parameter defining whether shadow detection should be enabled.
+
+.. seealso:: :ocv:class:`BackgroundSubtractorMOG2`
+
+
+ocl::MOG2::MOG2
+-----------------------
+The constructor.
+
+.. ocv:function:: ocl::MOG2::MOG2(int nmixtures = -1)
+
+    :param nmixtures: Number of Gaussian mixtures.
+
+Default constructor sets all parameters to default values.
+
+
+ocl::MOG2::operator()
+-------------------------
+Updates the background model and returns the foreground mask.
+
+.. ocv:function:: void ocl::MOG2::operator()( const oclMat& frame, oclMat& fgmask, float learningRate=-1.0f)
+
+    :param frame: Next video frame.
+
+    :param fgmask: The output foreground mask as an 8-bit binary image.
+
+    :param learningRate: Learning rate used when updating the background model.
+
+
+ocl::MOG2::getBackgroundImage
+---------------------------------
+Computes a background image.
+
+.. ocv:function:: void ocl::MOG2::getBackgroundImage(oclMat& backgroundImage) const
+
+    :param backgroundImage: The output background image.
+
+    :param stream: Stream for the asynchronous version.
+
+
+ocl::MOG2::release
+----------------------
+Releases the memory of all inner buffers.
+
+.. ocv:function:: void ocl::MOG2::release()
\ No newline at end of file
diff --git a/modules/ocl/include/opencv2/ocl.hpp b/modules/ocl/include/opencv2/ocl.hpp
index fc7c114d9..5591b3f64 100644
--- a/modules/ocl/include/opencv2/ocl.hpp
+++ b/modules/ocl/include/opencv2/ocl.hpp
@@ -50,6 +50,7 @@
 #include "opencv2/core.hpp"
 #include "opencv2/imgproc.hpp"
 #include "opencv2/objdetect.hpp"
+#include "opencv2/ml.hpp"
 
 namespace cv
 {
@@ -153,8 +154,8 @@ namespace cv
             static void setContext(Info &oclinfo);
 
             enum {CL_DOUBLE, CL_UNIFIED_MEM, CL_VER_1_2};
-            bool supportsFeature(int ftype);
-            size_t computeUnits();
+            bool supportsFeature(int ftype) const;
+            size_t computeUnits() const;
             size_t maxWorkGroupSize();
             void* oclContext();
             void* oclCommandQueue();
@@ -264,13 +265,12 @@ namespace cv
 
             //! returns deep copy of the oclMatrix, i.e. the data is copied
             oclMat clone() const;
-            //! copies the oclMatrix content to "m".
+
+            //! copies those oclMatrix elements to "m" that are marked with non-zero mask elements.
             // It calls m.create(this->size(), this->type()).
             // It supports any data type
-            void copyTo( oclMat &m ) const;
-            //! copies those oclMatrix elements to "m" that are marked with non-zero mask elements.
-            //It supports 8UC1 8UC4 32SC1 32SC4 32FC1 32FC4
-            void copyTo( oclMat &m, const oclMat &mask ) const;
+            void copyTo( oclMat &m, const oclMat &mask = oclMat()) const;
+
             //! converts oclMatrix to another datatype with optional scalng. See cvConvertScale.
             //It supports 8UC1 8UC4 32SC1 32SC4 32FC1 32FC4
             void convertTo( oclMat &m, int rtype, double alpha = 1, double beta = 0 ) const;
@@ -407,61 +407,52 @@ namespace cv
         CV_EXPORTS void split(const oclMat &src, std::vector<oclMat> &dst);
 
         ////////////////////////////// Arithmetics ///////////////////////////////////
-        //#if defined DOUBLE_SUPPORT
-        //typedef double F;
-        //#else
-        //typedef float F;
-        //#endif
-        //	CV_EXPORTS void addWeighted(const oclMat& a,F  alpha, const oclMat& b,F beta,F gama, oclMat& c);
-        CV_EXPORTS void addWeighted(const oclMat &a, double  alpha, const oclMat &b, double beta, double gama, oclMat &c);
-        //! adds one matrix to another (c = a + b)
-        // supports all types except CV_8SC1,CV_8SC2,CV8SC3 and CV_8SC4
-        CV_EXPORTS void add(const oclMat &a, const oclMat &b, oclMat &c);
-        //! adds one matrix to another (c = a + b)
-        // supports all types except CV_8SC1,CV_8SC2,CV8SC3 and CV_8SC4
-        CV_EXPORTS void add(const oclMat &a, const oclMat &b, oclMat &c, const oclMat &mask);
-        //! adds scalar to a matrix (c = a + s)
-        // supports all types except CV_8SC1,CV_8SC2,CV8SC3 and CV_8SC4
-        CV_EXPORTS void add(const oclMat &a, const Scalar &sc, oclMat &c, const oclMat &mask = oclMat());
-        //! subtracts one matrix from another (c = a - b)
-        // supports all types except CV_8SC1,CV_8SC2,CV8SC3 and CV_8SC4
-        CV_EXPORTS void subtract(const oclMat &a, const oclMat &b, oclMat &c);
-        //! subtracts one matrix from another (c = a - b)
-        // supports all types except CV_8SC1,CV_8SC2,CV8SC3 and CV_8SC4
-        CV_EXPORTS void subtract(const oclMat &a, const oclMat &b, oclMat &c, const oclMat &mask);
-        //! subtracts scalar from a matrix (c = a - s)
-        // supports all types except CV_8SC1,CV_8SC2,CV8SC3 and CV_8SC4
-        CV_EXPORTS void subtract(const oclMat &a, const Scalar &sc, oclMat &c, const oclMat &mask = oclMat());
-        //! subtracts scalar from a matrix (c = a - s)
-        // supports all types except CV_8SC1,CV_8SC2,CV8SC3 and CV_8SC4
-        CV_EXPORTS void subtract(const Scalar &sc, const oclMat &a, oclMat &c, const oclMat &mask = oclMat());
-        //! computes element-wise product of the two arrays (c = a * b)
-        // supports all types except CV_8SC1,CV_8SC2,CV8SC3 and CV_8SC4
-        CV_EXPORTS void multiply(const oclMat &a, const oclMat &b, oclMat &c, double scale = 1);
-        //! multiplies matrix to a number (dst = scalar * src)
-        // supports CV_32FC1 only
-        CV_EXPORTS void multiply(double scalar, const oclMat &src, oclMat &dst);
-        //! computes element-wise quotient of the two arrays (c = a / b)
-        // supports all types except CV_8SC1,CV_8SC2,CV8SC3 and CV_8SC4
-        CV_EXPORTS void divide(const oclMat &a, const oclMat &b, oclMat &c, double scale = 1);
-        //! computes element-wise quotient of the two arrays (c = a / b)
-        // supports all types except CV_8SC1,CV_8SC2,CV8SC3 and CV_8SC4
-        CV_EXPORTS void divide(double scale, const oclMat &b, oclMat &c);
 
-        //! compares elements of two arrays (c = a <cmpop> b)
-        // supports except CV_8SC1,CV_8SC2,CV8SC3,CV_8SC4 types
-        CV_EXPORTS void compare(const oclMat &a, const oclMat &b, oclMat &c, int cmpop);
+        //! adds one matrix to another with scale (dst = src1 * alpha + src2 * beta + gama)
+        CV_EXPORTS void addWeighted(const oclMat &src1, double  alpha, const oclMat &src2, double beta, double gama, oclMat &dst);
+
+        //! adds one matrix to another (dst = src1 + src2)
+        // supports all data types
+        CV_EXPORTS void add(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask = oclMat());
+        //! adds scalar to a matrix (dst = src1 + s)
+        // supports all data types
+        CV_EXPORTS void add(const oclMat &src1, const Scalar &s, oclMat &dst, const oclMat &mask = oclMat());
+
+        //! subtracts one matrix from another (dst = src1 - src2)
+        // supports all data types
+        CV_EXPORTS void subtract(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask = oclMat());
+        //! subtracts scalar from a matrix (dst = src1 - s)
+        // supports all data types
+        CV_EXPORTS void subtract(const oclMat &src1, const Scalar &s, oclMat &dst, const oclMat &mask = oclMat());
+
+        //! computes element-wise product of the two arrays (dst = src1 * scale * src2)
+        // supports all data types
+        CV_EXPORTS void multiply(const oclMat &src1, const oclMat &src2, oclMat &dst, double scale = 1);
+        //! multiplies matrix by a number (dst = scalar * src)
+        // supports all data types
+        CV_EXPORTS void multiply(double scalar, const oclMat &src, oclMat &dst);
+
+        //! computes element-wise quotient of the two arrays (dst = src1 * scale / src2)
+        // supports all data types
+        CV_EXPORTS void divide(const oclMat &src1, const oclMat &src2, oclMat &dst, double scale = 1);
+        //! computes element-wise quotient of the two arrays (dst = scale / src)
+        // supports all data types
+        CV_EXPORTS void divide(double scale, const oclMat &src1, oclMat &dst);
+
+        //! compares elements of two arrays (dst = src1 <cmpop> src2)
+        // supports all data types
+        CV_EXPORTS void compare(const oclMat &src1, const oclMat &src2, oclMat &dst, int cmpop);
 
         //! transposes the matrix
-        // supports  CV_8UC1, 8UC4, 8SC4, 16UC2, 16SC2, 32SC1 and 32FC1.(the same as cuda)
+        // supports all data types
         CV_EXPORTS void transpose(const oclMat &src, oclMat &dst);
 
-        //! computes element-wise absolute difference of two arrays (c = abs(a - b))
-        // supports all types except CV_8SC1,CV_8SC2,CV8SC3 and CV_8SC4
-        CV_EXPORTS void absdiff(const oclMat &a, const oclMat &b, oclMat &c);
-        //! computes element-wise absolute difference of array and scalar (c = abs(a - s))
-        // supports all types except CV_8SC1,CV_8SC2,CV8SC3 and CV_8SC4
-        CV_EXPORTS void absdiff(const oclMat &a, const Scalar &s, oclMat &c);
+        //! computes element-wise absolute difference of two arrays (dst = abs(src1 - src2))
+        // supports all data types
+        CV_EXPORTS void absdiff(const oclMat &src1, const oclMat &src2, oclMat &dst);
+        //! computes element-wise absolute difference of array and scalar (dst = abs(src1 - s))
+        // supports all data types
+        CV_EXPORTS void absdiff(const oclMat &src1, const Scalar &s, oclMat &dst);
 
         //! computes mean value and standard deviation of all or selected array elements
         // supports except CV_32F,CV_64F
@@ -479,7 +470,7 @@ namespace cv
 
         //! reverses the order of the rows, columns or both in a matrix
         // supports all types
-        CV_EXPORTS void flip(const oclMat &a, oclMat &b, int flipCode);
+        CV_EXPORTS void flip(const oclMat &src, oclMat &dst, int flipCode);
 
         //! computes sum of array elements
         // disabled until fix crash
@@ -490,13 +481,11 @@ namespace cv
 
         //! finds global minimum and maximum array elements and returns their values
         // support all C1 types
-
         CV_EXPORTS void minMax(const oclMat &src, double *minVal, double *maxVal = 0, const oclMat &mask = oclMat());
         CV_EXPORTS void minMax_buf(const oclMat &src, double *minVal, double *maxVal, const oclMat &mask, oclMat& buf);
 
         //! finds global minimum and maximum array elements and returns their values with locations
         // support all C1 types
-
         CV_EXPORTS void minMaxLoc(const oclMat &src, double *minVal, double *maxVal = 0, Point *minLoc = 0, Point *maxLoc = 0,
                                   const oclMat &mask = oclMat());
 
@@ -525,30 +514,27 @@ namespace cv
         //  This is not truly a bilateral filter. Instead of using user provided fixed parameters,
         //  the function calculates a constant at each window based on local standard deviation,
         //  and use this constant to do filtering.
-        //  supports 8UC1 8UC3
+        //  supports 8UC1, 8UC3
         CV_EXPORTS void adaptiveBilateralFilter(const oclMat& src, oclMat& dst, Size ksize, double sigmaSpace, Point anchor = Point(-1, -1), int borderType=BORDER_DEFAULT);
 
-        //! computes exponent of each matrix element (b = e**a)
-        // supports only CV_32FC1 type
-        CV_EXPORTS void exp(const oclMat &a, oclMat &b);
+        //! computes exponent of each matrix element (dst = e**src)
+        // supports only CV_32FC1, CV_64FC1 type
+        CV_EXPORTS void exp(const oclMat &src, oclMat &dst);
 
-        //! computes natural logarithm of absolute value of each matrix element: b = log(abs(a))
-        // supports only CV_32FC1 type
-        CV_EXPORTS void log(const oclMat &a, oclMat &b);
+        //! computes natural logarithm of absolute value of each matrix element: dst = log(abs(src))
+        // supports only CV_32FC1, CV_64FC1 type
+        CV_EXPORTS void log(const oclMat &src, oclMat &dst);
 
         //! computes magnitude of each (x(i), y(i)) vector
-        // supports only CV_32F CV_64F type
+        // supports only CV_32F, CV_64F type
         CV_EXPORTS void magnitude(const oclMat &x, const oclMat &y, oclMat &magnitude);
-        CV_EXPORTS void magnitudeSqr(const oclMat &x, const oclMat &y, oclMat &magnitude);
-
-        CV_EXPORTS void magnitudeSqr(const oclMat &x, oclMat &magnitude);
 
         //! computes angle (angle(i)) of each (x(i), y(i)) vector
-        // supports only CV_32F CV_64F type
+        // supports only CV_32F, CV_64F type
         CV_EXPORTS void phase(const oclMat &x, const oclMat &y, oclMat &angle, bool angleInDegrees = false);
 
         //! the function raises every element of tne input array to p
-        //! support only CV_32F CV_64F type
+        // support only CV_32F, CV_64F type
         CV_EXPORTS void pow(const oclMat &x, double p, oclMat &y);
 
         //! converts Cartesian coordinates to polar
@@ -562,14 +548,17 @@ namespace cv
         //! perfroms per-elements bit-wise inversion
         // supports all types
         CV_EXPORTS void bitwise_not(const oclMat &src, oclMat &dst);
+
         //! calculates per-element bit-wise disjunction of two arrays
         // supports all types
         CV_EXPORTS void bitwise_or(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask = oclMat());
         CV_EXPORTS void bitwise_or(const oclMat &src1, const Scalar &s, oclMat &dst, const oclMat &mask = oclMat());
+
         //! calculates per-element bit-wise conjunction of two arrays
         // supports all types
         CV_EXPORTS void bitwise_and(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask = oclMat());
         CV_EXPORTS void bitwise_and(const oclMat &src1, const Scalar &s, oclMat &dst, const oclMat &mask = oclMat());
+
         //! calculates per-element bit-wise "exclusive or" operation
         // supports all types
         CV_EXPORTS void bitwise_xor(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask = oclMat());
@@ -603,7 +592,7 @@ namespace cv
         };
 
         //! computes convolution of two images, may use discrete Fourier transform
-        //! support only CV_32FC1 type
+        // support only CV_32FC1 type
         CV_EXPORTS void convolve(const oclMat &image, const oclMat &temp1, oclMat &result, bool ccorr = false);
         CV_EXPORTS void convolve(const oclMat &image, const oclMat &temp1, oclMat &result, bool ccorr, ConvolveBuf& buf);
 
@@ -614,6 +603,8 @@ namespace cv
 
         CV_EXPORTS void cvtColor(const oclMat &src, oclMat &dst, int code , int dcn = 0);
 
+        CV_EXPORTS void setIdentity(oclMat& src, double val);
+
         //////////////////////////////// Filter Engine ////////////////////////////////
 
         /*!
@@ -982,7 +973,7 @@ namespace cv
         // real to complex dft requires at least v1.8 clAmdFft
         // real to complex dft output is not the same with cpu version
         // real to complex and complex to real does not support DFT_ROWS
-        CV_EXPORTS void dft(const oclMat &src, oclMat &dst, Size dft_size = Size(0, 0), int flags = 0);
+        CV_EXPORTS void dft(const oclMat &src, oclMat &dst, Size dft_size = Size(), int flags = 0);
 
         //! implements generalized matrix product algorithm GEMM from BLAS
         // The functionality requires clAmdBlas library
@@ -1954,6 +1945,80 @@ namespace cv
 
             oclMat bgmodelUsedModes_; //keep track of number of modes per pixel
         };
+
+        /*!***************Kalman Filter*************!*/
+        class CV_EXPORTS KalmanFilter
+        {
+        public:
+            KalmanFilter();
+            //! the full constructor taking the dimensionality of the state, of the measurement and of the control vector
+            KalmanFilter(int dynamParams, int measureParams, int controlParams=0, int type=CV_32F);
+            //! re-initializes Kalman filter. The previous content is destroyed.
+            void init(int dynamParams, int measureParams, int controlParams=0, int type=CV_32F);
+
+            const oclMat& predict(const oclMat& control=oclMat());
+            const oclMat& correct(const oclMat& measurement);
+
+            oclMat statePre;           //!< predicted state (x'(k)): x(k)=A*x(k-1)+B*u(k)
+            oclMat statePost;          //!< corrected state (x(k)): x(k)=x'(k)+K(k)*(z(k)-H*x'(k))
+            oclMat transitionMatrix;   //!< state transition matrix (A)
+            oclMat controlMatrix;      //!< control matrix (B) (not used if there is no control)
+            oclMat measurementMatrix;  //!< measurement matrix (H)
+            oclMat processNoiseCov;    //!< process noise covariance matrix (Q)
+            oclMat measurementNoiseCov;//!< measurement noise covariance matrix (R)
+            oclMat errorCovPre;        //!< priori error estimate covariance matrix (P'(k)): P'(k)=A*P(k-1)*At + Q
+            oclMat gain;               //!< Kalman gain matrix (K(k)): K(k)=P'(k)*Ht*inv(H*P'(k)*Ht+R)
+            oclMat errorCovPost;       //!< posteriori error estimate covariance matrix (P(k)): P(k)=(I-K(k)*H)*P'(k)
+        private:
+            oclMat temp1;
+            oclMat temp2;
+            oclMat temp3;
+            oclMat temp4;
+            oclMat temp5;
+        };
+
+        static inline size_t divUp(size_t total, size_t grain)
+        {
+            return (total + grain - 1) / grain;
+        }
+
+        /*!***************K Nearest Neighbour*************!*/
+        class CV_EXPORTS KNearestNeighbour: public CvKNearest
+        {
+        public:
+            KNearestNeighbour();
+            ~KNearestNeighbour();
+
+            bool train(const Mat& trainData, Mat& labels, Mat& sampleIdx = Mat().setTo(Scalar::all(0)),
+                bool isRegression = false, int max_k = 32, bool updateBase = false);
+
+            void clear();
+
+            void find_nearest(const oclMat& samples, int k, oclMat& lables);
+
+        private:
+            oclMat samples_ocl;
+        };
+        /*!***************  SVM  *************!*/
+        class CV_EXPORTS CvSVM_OCL : public CvSVM
+        {
+        public:
+            CvSVM_OCL();
+
+            CvSVM_OCL(const cv::Mat& trainData, const cv::Mat& responses,
+                      const cv::Mat& varIdx=cv::Mat(), const cv::Mat& sampleIdx=cv::Mat(),
+                      CvSVMParams params=CvSVMParams());
+            CV_WRAP float predict( const int row_index, Mat& src, bool returnDFVal=false ) const;
+            CV_WRAP void predict( cv::InputArray samples, cv::OutputArray results ) const;
+            CV_WRAP float predict( const cv::Mat& sample, bool returnDFVal=false ) const;
+            float predict( const CvMat* samples, CV_OUT CvMat* results ) const;
+
+        protected:
+            float predict( const int row_index, int row_len, Mat& src, bool returnDFVal=false ) const;
+            void create_kernel();
+            void create_solver();
+        };
+        /*!***************  END  *************!*/
     }
 }
 #if defined _MSC_VER && _MSC_VER >= 1200
diff --git a/modules/ocl/include/opencv2/ocl/private/util.hpp b/modules/ocl/include/opencv2/ocl/private/util.hpp
index f8172e0c2..6f7c4bc5d 100644
--- a/modules/ocl/include/opencv2/ocl/private/util.hpp
+++ b/modules/ocl/include/opencv2/ocl/private/util.hpp
@@ -174,6 +174,8 @@ namespace cv
         size_t CV_EXPORTS queryDeviceInfo<WAVEFRONT_SIZE, size_t>(cl_kernel kernel);
         template<>
         bool CV_EXPORTS queryDeviceInfo<IS_CPU_DEVICE, bool>(cl_kernel kernel);
+
+        unsigned long CV_EXPORTS queryLocalMemInfo();
     }//namespace ocl
 
 }//namespace cv
diff --git a/modules/ocl/perf/perf_arithm.cpp b/modules/ocl/perf/perf_arithm.cpp
index 814b272f0..d718ed551 100644
--- a/modules/ocl/perf/perf_arithm.cpp
+++ b/modules/ocl/perf/perf_arithm.cpp
@@ -842,54 +842,6 @@ PERF_TEST_P(PowFixture, pow, OCL_TYPICAL_MAT_SIZES)
         OCL_PERF_ELSE
 }
 
-///////////// MagnitudeSqr////////////////////////
-
-typedef TestBaseWithParam<Size> MagnitudeSqrFixture;
-
-PERF_TEST_P(MagnitudeSqrFixture, MagnitudeSqr, OCL_TYPICAL_MAT_SIZES)
-{
-    const Size srcSize = GetParam();
-
-    Mat src1(srcSize, CV_32FC1), src2(srcSize, CV_32FC1),
-            dst(srcSize, CV_32FC1);
-    declare.in(src1, src2, WARMUP_RNG).out(dst);
-
-    if (RUN_OCL_IMPL)
-    {
-        ocl::oclMat oclSrc1(src1), oclSrc2(src2), oclDst(srcSize, src1.type());
-
-        OCL_TEST_CYCLE() cv::ocl::magnitudeSqr(oclSrc1, oclSrc2, oclDst);
-
-        oclDst.download(dst);
-
-        SANITY_CHECK(dst, 1e-6, ERROR_RELATIVE);
-    }
-    else if (RUN_PLAIN_IMPL)
-    {
-        ASSERT_EQ(1, src1.channels());
-
-        TEST_CYCLE()
-        {
-            for (int y = 0; y < srcSize.height; ++y)
-            {
-                const float * const src1Data = reinterpret_cast<float *>(src1.data + src1.step * y);
-                const float * const src2Data = reinterpret_cast<float *>(src2.data + src2.step * y);
-                float * const dstData = reinterpret_cast<float *>(dst.data + dst.step * y);
-                for (int x = 0; x < srcSize.width; ++x)
-                {
-                    float t0 = src1Data[x] * src1Data[x];
-                    float t1 = src2Data[x] * src2Data[x];
-                    dstData[x] = t0 + t1;
-                }
-            }
-        }
-
-        SANITY_CHECK(dst, 1e-6, ERROR_RELATIVE);
-    }
-    else
-        OCL_PERF_ELSE
-}
-
 ///////////// AddWeighted////////////////////////
 
 typedef Size_MatType AddWeightedFixture;
diff --git a/modules/ocl/perf/perf_bgfg.cpp b/modules/ocl/perf/perf_bgfg.cpp
index 9ccd1657e..40628d9bb 100644
--- a/modules/ocl/perf/perf_bgfg.cpp
+++ b/modules/ocl/perf/perf_bgfg.cpp
@@ -44,12 +44,14 @@
 //
 //M*/
 #include "perf_precomp.hpp"
+
 using namespace perf;
 using namespace std;
 using namespace cv::ocl;
 using namespace cv;
 using std::tr1::tuple;
 using std::tr1::get;
+
 #if defined(HAVE_XINE)         || \
     defined(HAVE_GSTREAMER)    || \
     defined(HAVE_QUICKTIME)    || \
@@ -63,6 +65,7 @@ using std::tr1::get;
 #endif
 
 #if BUILD_WITH_VIDEO_INPUT_SUPPORT
+
 static void cvtFrameFmt(vector<Mat>& input, vector<Mat>& output)
 {
     for(int i = 0; i< (int)(input.size()); i++)
@@ -70,6 +73,7 @@ static void cvtFrameFmt(vector<Mat>& input, vector<Mat>& output)
         cvtColor(input[i], output[i], COLOR_RGB2GRAY);
     }
 }
+
 //prepare data for CPU
 static void prepareData(VideoCapture& cap, int cn, vector<Mat>& frame_buffer)
 {
@@ -88,15 +92,15 @@ static void prepareData(VideoCapture& cap, int cn, vector<Mat>& frame_buffer)
     else
         frame_buffer = frame_buffer_init;
 }
+
 //copy CPU data to GPU
 static void prepareData(vector<Mat>& frame_buffer, vector<oclMat>& frame_buffer_ocl)
 {
     for(int i = 0; i < (int)frame_buffer.size(); i++)
         frame_buffer_ocl.push_back(cv::ocl::oclMat(frame_buffer[i]));
 }
-#endif
+
 ///////////// MOG ////////////////////////
-#if BUILD_WITH_VIDEO_INPUT_SUPPORT
 
 typedef tuple<string, int, double> VideoMOGParamType;
 typedef TestBaseWithParam<VideoMOGParamType> VideoMOGFixture;
@@ -137,7 +141,8 @@ PERF_TEST_P(VideoMOGFixture, MOG,
             }
         }
         SANITY_CHECK(foreground);
-    }else if(RUN_OCL_IMPL)
+    }
+    else if(RUN_OCL_IMPL)
     {
         prepareData(frame_buffer, frame_buffer_ocl);
         CV_Assert((int)(frame_buffer_ocl.size()) == nFrame);
@@ -152,13 +157,12 @@ PERF_TEST_P(VideoMOGFixture, MOG,
         }
         foreground_d.download(foreground);
         SANITY_CHECK(foreground);
-    }else
+    }
+    else
         OCL_PERF_ELSE
 }
-#endif
 
 ///////////// MOG2 ////////////////////////
-#if BUILD_WITH_VIDEO_INPUT_SUPPORT
 
 typedef tuple<string, int> VideoMOG2ParamType;
 typedef TestBaseWithParam<VideoMOG2ParamType> VideoMOG2Fixture;
@@ -196,7 +200,8 @@ PERF_TEST_P(VideoMOG2Fixture, MOG2,
             }
         }
         SANITY_CHECK(foreground);
-    }else if(RUN_OCL_IMPL)
+    }
+    else if(RUN_OCL_IMPL)
     {
         prepareData(frame_buffer, frame_buffer_ocl);
         CV_Assert((int)(frame_buffer_ocl.size()) == nFrame);
@@ -211,13 +216,12 @@ PERF_TEST_P(VideoMOG2Fixture, MOG2,
         }
         foreground_d.download(foreground);
         SANITY_CHECK(foreground);
-    }else
+    }
+    else
         OCL_PERF_ELSE
 }
-#endif
 
 ///////////// MOG2_GetBackgroundImage //////////////////
-#if BUILD_WITH_VIDEO_INPUT_SUPPORT
 
 typedef TestBaseWithParam<VideoMOG2ParamType> Video_MOG2GetBackgroundImage;
 
@@ -259,7 +263,8 @@ PERF_TEST_P(Video_MOG2GetBackgroundImage, MOG2,
             mog2->getBackgroundImage(background);
         }
         SANITY_CHECK(background);
-    }else if(RUN_OCL_IMPL)
+    }
+    else if(RUN_OCL_IMPL)
     {
         prepareData(frame_buffer, frame_buffer_ocl);
         CV_Assert((int)(frame_buffer_ocl.size()) == nFrame);
@@ -276,7 +281,9 @@ PERF_TEST_P(Video_MOG2GetBackgroundImage, MOG2,
         }
         background_d.download(background);
         SANITY_CHECK(background);
-    }else
+    }
+    else
         OCL_PERF_ELSE
 }
+
 #endif
diff --git a/modules/ocl/perf/perf_filters.cpp b/modules/ocl/perf/perf_filters.cpp
index aa562412b..7f2758877 100644
--- a/modules/ocl/perf/perf_filters.cpp
+++ b/modules/ocl/perf/perf_filters.cpp
@@ -333,13 +333,13 @@ PERF_TEST_P(BilateralFixture, Bilateral,
     const Size_MatType_t params = GetParam();
     const Size srcSize = get<0>(params);
     const int type = get<1>(params), d = 7;
-    double sigmacolor = 50.0, sigmaspace = 50.0;
+    const double sigmacolor = 50.0, sigmaspace = 50.0;
 
     Mat src(srcSize, type), dst(srcSize, type);
     declare.in(src, WARMUP_RNG).out(dst);
 
-    if (srcSize == OCL_SIZE_4000 && type == CV_8UC3)
-        declare.time(8);
+    if (srcSize == OCL_SIZE_4000)
+        declare.time(type == CV_8UC3 ? 8 : 4.5);
 
     if (RUN_OCL_IMPL)
     {
@@ -372,14 +372,16 @@ PERF_TEST_P(adaptiveBilateralFixture, adaptiveBilateral,
     const Size_MatType_t params = GetParam();
     const Size srcSize = get<0>(params);
     const int type = get<1>(params);
-    double sigmaspace = 10.0;
-    Size ksize(9,9);
+    const double sigmaspace = 10.0;
+    Size ksize(9, 9);
 
     Mat src(srcSize, type), dst(srcSize, type);
     declare.in(src, WARMUP_RNG).out(dst);
 
     if (srcSize == OCL_SIZE_4000)
-        declare.time(15);
+        declare.time(type == CV_8UC3 ? 46 : 28);
+    else if (srcSize == OCL_SIZE_2000)
+        declare.time(type == CV_8UC3 ? 11 : 7);
 
     if (RUN_OCL_IMPL)
     {
@@ -389,7 +391,7 @@ PERF_TEST_P(adaptiveBilateralFixture, adaptiveBilateral,
 
         oclDst.download(dst);
 
-        SANITY_CHECK(dst, 1.);
+        SANITY_CHECK(dst, 1.0);
     }
     else if (RUN_PLAIN_IMPL)
     {
diff --git a/modules/ocl/perf/perf_hog.cpp b/modules/ocl/perf/perf_hog.cpp
index 15846d831..fe5d9d190 100644
--- a/modules/ocl/perf/perf_hog.cpp
+++ b/modules/ocl/perf/perf_hog.cpp
@@ -49,6 +49,23 @@ using namespace perf;
 
 ///////////// HOG////////////////////////
 
+struct RectLess : // comparator ordering rectangles lexicographically by (x, y, width, height)
+        public std::binary_function<cv::Rect, cv::Rect, bool> // NOTE(review): std::binary_function is deprecated in C++11 and removed in C++17
+{
+    bool operator()(const cv::Rect& a, // returns true when a orders strictly before b
+        const cv::Rect& b) const
+    {
+        if (a.x != b.x) // first key: x
+            return a.x < b.x;
+        else if (a.y != b.y) // second key: y
+            return a.y < b.y;
+        else if (a.width != b.width) // third key: width
+            return a.width < b.width;
+        else // last key: height; equal rectangles compare false
+            return a.height < b.height;
+    }
+};
+
 PERF_TEST(HOGFixture, HOG)
 {
     Mat src = imread(getDataPath("gpu/hog/road.png"), cv::IMREAD_GRAYSCALE);
@@ -64,6 +81,7 @@ PERF_TEST(HOGFixture, HOG)
 
         TEST_CYCLE() hog.detectMultiScale(src, found_locations);
 
+        std::sort(found_locations.begin(), found_locations.end(), RectLess());
         SANITY_CHECK(found_locations, 1 + DBL_EPSILON);
     }
     else if (RUN_OCL_IMPL)
@@ -74,6 +92,7 @@ PERF_TEST(HOGFixture, HOG)
 
         OCL_TEST_CYCLE() ocl_hog.detectMultiScale(oclSrc, found_locations);
 
+        std::sort(found_locations.begin(), found_locations.end(), RectLess());
         SANITY_CHECK(found_locations, 1 + DBL_EPSILON);
     }
     else
diff --git a/modules/ocl/perf/perf_kalman.cpp b/modules/ocl/perf/perf_kalman.cpp
new file mode 100644
index 000000000..b5f713be9
--- /dev/null
+++ b/modules/ocl/perf/perf_kalman.cpp
@@ -0,0 +1,93 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Fangfang Bai, fangfang@multicorewareinc.com
+//    Jin Ma,       jin@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+#include "perf_precomp.hpp"
+using namespace perf;
+using namespace std;
+using namespace cv::ocl;
+using namespace cv;
+using std::tr1::tuple;
+using std::tr1::get;
+///////////// Kalman Filter ////////////////////////
+// Times init + correct + predict on the CPU (cv::KalmanFilter) and OpenCL (cv::ocl::KalmanFilter) paths.
+typedef tuple<int> KalmanFilterType; // single int parameter, read below as the filter dimension
+typedef TestBaseWithParam<KalmanFilterType> KalmanFilterFixture;
+
+PERF_TEST_P(KalmanFilterFixture, KalmanFilter,
+            ::testing::Values(1000, 1500)) // dimensions under test
+{
+    KalmanFilterType params = GetParam();
+    const int dim = get<0>(params); // passed as both arguments of init() below
+
+    cv::Mat sample(dim, 1, CV_32FC1), dresult; // dim x 1 float measurement; NOTE(review): dresult is never used
+    randu(sample, -1, 1); // uniformly distributed random values between -1 and 1
+
+    cv::Mat statePre_; // host copy of statePre, checked after either branch
+
+    if(RUN_PLAIN_IMPL)
+    {
+        cv::KalmanFilter kalman;
+        TEST_CYCLE()
+        {
+            kalman.init(dim, dim); // init runs inside every timed iteration, so its cost is measured too
+            kalman.correct(sample);
+            kalman.predict();
+        }
+        statePre_ = kalman.statePre;
+    }else if(RUN_OCL_IMPL)
+    {
+        cv::ocl::oclMat dsample(sample); // device copy of the measurement, created outside the timed loop
+        cv::ocl::KalmanFilter kalman_ocl;
+        OCL_TEST_CYCLE()
+        {
+            kalman_ocl.init(dim, dim); // same sequence as the CPU branch
+            kalman_ocl.correct(dsample);
+            kalman_ocl.predict();
+        }
+        kalman_ocl.statePre.download(statePre_); // bring the device result back to the host for the check
+    }else
+        OCL_PERF_ELSE
+    SANITY_CHECK(statePre_);
+}
\ No newline at end of file
diff --git a/modules/ocl/perf/perf_matrix_operation.cpp b/modules/ocl/perf/perf_matrix_operation.cpp
index ad1327503..13ce47a1b 100644
--- a/modules/ocl/perf/perf_matrix_operation.cpp
+++ b/modules/ocl/perf/perf_matrix_operation.cpp
@@ -155,3 +155,78 @@ PERF_TEST_P(setToFixture, setTo,
     else
         OCL_PERF_ELSE
 }
+
+/////////////////// upload ///////////////////////////
+// Times oclMat::upload of a host Mat, with Mat::clone as the plain-implementation baseline.
+typedef tuple<Size, int, int> uploadParams; // (matrix size, depth, channel count)
+typedef TestBaseWithParam<uploadParams> uploadFixture;
+
+PERF_TEST_P(uploadFixture, DISABLED_upload, // DISABLED_ prefix: gtest skips this test unless disabled tests are explicitly requested
+            testing::Combine(
+                OCL_TYPICAL_MAT_SIZES,
+                testing::Range(CV_8U, CV_64F), // Range excludes its end value, so CV_64F itself is not tested
+                testing::Range(1, 5))) // channel counts 1..4
+{
+    const uploadParams params = GetParam();
+    const Size srcSize = get<0>(params);
+    const int depth = get<1>(params), cn = get<2>(params);
+    const int type = CV_MAKE_TYPE(depth, cn);
+
+    Mat src(srcSize, type), dst;
+    declare.in(src, WARMUP_RNG);
+
+    if (RUN_OCL_IMPL)
+    {
+        ocl::oclMat oclDst;
+        // Hand-written timing loop: ocl::finish() runs before stopTimer(), then the destination is released before the next iteration.
+        for(; startTimer(), next(); ocl::finish(), stopTimer(), oclDst.release())
+            oclDst.upload(src);
+    }
+    else if (RUN_PLAIN_IMPL)
+    {
+        for(; startTimer(), next(); ocl::finish(), stopTimer(), dst.release()) // NOTE(review): ocl::finish() is also called on the plain path
+            dst = src.clone();
+    }
+    else
+        OCL_PERF_ELSE
+
+    int value = 0; // constant placeholder: no output data is verified by this test
+    SANITY_CHECK(value);
+}
+
+/////////////////// download ///////////////////////////
+// Times oclMat::download into a host Mat, with Mat::clone as the plain-implementation baseline.
+typedef TestBaseWithParam<uploadParams> downloadFixture; // reuses the (size, depth, channels) tuple of the upload test
+
+PERF_TEST_P(downloadFixture, DISABLED_download, // DISABLED_ prefix: gtest skips this test unless disabled tests are explicitly requested
+            testing::Combine(
+                OCL_TYPICAL_MAT_SIZES,
+                testing::Range(CV_8U, CV_64F), // Range excludes its end value, so CV_64F itself is not tested
+                testing::Range(1, 5))) // channel counts 1..4
+{
+    const uploadParams params = GetParam();
+    const Size srcSize = get<0>(params);
+    const int depth = get<1>(params), cn = get<2>(params);
+    const int type = CV_MAKE_TYPE(depth, cn);
+
+    Mat src(srcSize, type), dst;
+    declare.in(src, WARMUP_RNG);
+
+    if (RUN_OCL_IMPL)
+    {
+        ocl::oclMat oclSrc(src); // device copy is created once, outside the timed loop
+        // Hand-written timing loop: ocl::finish() runs before stopTimer(), then dst is released before the next iteration.
+        for(; startTimer(), next(); ocl::finish(), stopTimer(), dst.release())
+            oclSrc.download(dst);
+    }
+    else if (RUN_PLAIN_IMPL)
+    {
+        for(; startTimer(), next(); ocl::finish(), stopTimer(), dst.release()) // NOTE(review): ocl::finish() is also called on the plain path
+            dst = src.clone();
+    }
+    else
+        OCL_PERF_ELSE
+
+    int value = 0; // constant placeholder: no output data is verified by this test
+    SANITY_CHECK(value);
+}
diff --git a/modules/ocl/perf/perf_ml.cpp b/modules/ocl/perf/perf_ml.cpp
new file mode 100644
index 000000000..fac471ed4
--- /dev/null
+++ b/modules/ocl/perf/perf_ml.cpp
@@ -0,0 +1,109 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Jin Ma, jin@multicorewareinc.com
+//    Xiaopeng Fu, fuxiaopeng2222@163.com
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+#include "perf_precomp.hpp"
+using namespace perf;
+using namespace std;
+using namespace cv::ocl;
+using namespace cv;
+using std::tr1::tuple;
+using std::tr1::get;
+////////////////////////////////// K-NEAREST NEIGHBOR ////////////////////////////////////
+static void genData(Mat& trainData, Size size, Mat& trainLabel = Mat().setTo(Scalar::all(0)), int nClasses = 0) // random float data, plus labels when nClasses != 0
+{
+    trainData.create(size, CV_32FC1);
+    randu(trainData, 1.0, 100.0); // uniformly distributed feature values between 1 and 100
+    // Labels are generated only when a class count is given; otherwise trainLabel is left untouched.
+    if(nClasses != 0)
+    {
+        trainLabel.create(size.height, 1, CV_8UC1); // one label per data row
+        randu(trainLabel, 0, nClasses - 1); // NOTE(review): if randu's upper bound is exclusive, label nClasses - 1 is never produced -- confirm
+        trainLabel.convertTo(trainLabel, CV_32FC1); // labels are drawn as 8-bit values, then converted to float in place
+    }
+}
+
+typedef tuple<int> KNNParamType; // single int parameter, read below as the number of sample rows
+typedef TestBaseWithParam<KNNParamType> KNNFixture;
+// Times train + find_nearest on the CPU (CvKNearest) and OpenCL (cv::ocl::KNearestNeighbour) paths.
+PERF_TEST_P(KNNFixture, KNN,
+            testing::Values(1000, 2000, 4000)) // row counts under test
+{
+    KNNParamType params = GetParam();
+    const int rows = get<0>(params);
+    int columns = 100; // values per sample row
+    int k = rows/250; // 4, 8 and 16 neighbours for the three row counts
+
+    Mat trainData, trainLabels;
+    Size size(columns, rows); // cv::Size is (width, height), i.e. (columns, rows)
+    genData(trainData, size, trainLabels, 3); // training set with labels for 3 classes
+
+    Mat testData;
+    genData(testData, size); // query set: no labels requested (nClasses defaults to 0)
+    Mat best_label; // host-side result passed to SANITY_CHECK
+
+    if(RUN_PLAIN_IMPL)
+    {
+        TEST_CYCLE()
+        {
+            CvKNearest knn_cpu; // model is constructed and trained inside the timed loop
+            knn_cpu.train(trainData, trainLabels);
+            knn_cpu.find_nearest(testData, k, &best_label);
+        }
+    }else if(RUN_OCL_IMPL)
+    {
+        cv::ocl::oclMat best_label_ocl;
+        cv::ocl::oclMat testdata;
+        testdata.upload(testData); // query data is uploaded once, outside the timed loop
+
+        OCL_TEST_CYCLE()
+        {
+            cv::ocl::KNearestNeighbour knn_ocl; // model is constructed and trained inside the timed loop
+            knn_ocl.train(trainData, trainLabels);
+            knn_ocl.find_nearest(testdata, k, best_label_ocl);
+        }
+        best_label_ocl.download(best_label); // bring the device result back to the host for the check
+    }else
+        OCL_PERF_ELSE
+    SANITY_CHECK(best_label);
+}
\ No newline at end of file
diff --git a/modules/ocl/src/arithm.cpp b/modules/ocl/src/arithm.cpp
index c9ee6f9f6..94ddd5631 100644
--- a/modules/ocl/src/arithm.cpp
+++ b/modules/ocl/src/arithm.cpp
@@ -61,11 +61,10 @@ namespace cv
 {
     namespace ocl
     {
-        ////////////////////////////////OpenCL kernel strings/////////////////////
-        extern const char *transpose_kernel;
+        //////////////////////////////// OpenCL kernel strings /////////////////////
+
         extern const char *arithm_nonzero;
         extern const char *arithm_sum;
-        extern const char *arithm_2_mat;
         extern const char *arithm_sum_3;
         extern const char *arithm_minMax;
         extern const char *arithm_minMax_mask;
@@ -73,6 +72,7 @@ namespace cv
         extern const char *arithm_minMaxLoc_mask;
         extern const char *arithm_LUT;
         extern const char *arithm_add;
+        extern const char *arithm_add_mask;
         extern const char *arithm_add_scalar;
         extern const char *arithm_add_scalar_mask;
         extern const char *arithm_bitwise_binary;
@@ -80,11 +80,7 @@ namespace cv
         extern const char *arithm_bitwise_binary_scalar;
         extern const char *arithm_bitwise_binary_scalar_mask;
         extern const char *arithm_bitwise_not;
-        extern const char *arithm_compare_eq;
-        extern const char *arithm_compare_ne;
-        extern const char *arithm_mul;
-        extern const char *arithm_div;
-        extern const char *arithm_absdiff;
+        extern const char *arithm_compare;
         extern const char *arithm_transpose;
         extern const char *arithm_flip;
         extern const char *arithm_flip_rc;
@@ -96,485 +92,225 @@ namespace cv
         extern const char *arithm_addWeighted;
         extern const char *arithm_phase;
         extern const char *arithm_pow;
-        extern const char *arithm_magnitudeSqr;
-        //extern const char * jhp_transpose_kernel;
-        int64 kernelrealtotal = 0;
-        int64 kernelalltotal = 0;
-        int64 reducetotal = 0;
-        int64 downloadtotal = 0;
-        int64 alltotal = 0;
+        extern const char *arithm_setidentity;
     }
 }
 
-//////////////////////////////////////////////////////////////////////////
-//////////////////common/////////////////////////////////////////////////
-///////////////////////////////////////////////////////////////////////
-inline int divUp(int total, int grain)
-{
-    return (total + grain - 1) / grain;
-}
 //////////////////////////////////////////////////////////////////////////////
 /////////////////////// add subtract multiply divide /////////////////////////
 //////////////////////////////////////////////////////////////////////////////
-template<typename T>
-void arithmetic_run(const oclMat &src1, const oclMat &src2, oclMat &dst,
-                    String kernelName, const char **kernelString, void *_scalar, int op_type = 0)
+
+enum { ADD = 0, SUB, MUL, DIV, ABS_DIFF };
+
+static void arithmetic_run_generic(const oclMat &src1, const oclMat &src2, const Scalar & scalar, const oclMat & mask,
+                            oclMat &dst, int op_type, bool use_scalar = false)
 {
-    if(!src1.clCxt->supportsFeature(Context::CL_DOUBLE) && src1.type() == CV_64F)
+    Context *clCxt = src1.clCxt;
+    bool hasDouble = clCxt->supportsFeature(Context::CL_DOUBLE);
+    if (!hasDouble && (src1.depth() == CV_64F || src2.depth() == CV_64F || dst.depth() == CV_64F))
     {
-        CV_Error(Error::GpuNotSupported, "Selected device don't support double\r\n");
+        CV_Error(Error::GpuNotSupported, "Selected device doesn't support double\r\n");
         return;
     }
 
-    dst.create(src1.size(), src1.type());
-    CV_Assert(src1.cols == src2.cols && src2.cols == dst.cols &&
-              src1.rows == src2.rows && src2.rows == dst.rows);
-
-    CV_Assert(src1.type() == src2.type() && src1.type() == dst.type());
-    CV_Assert(src1.depth() != CV_8S);
-
-    Context  *clCxt = src1.clCxt;
-    int channels = dst.oclchannels();
-    int depth = dst.depth();
-
-    int vector_lengths[4][7] = {{4, 0, 4, 4, 1, 1, 1},
-        {4, 0, 4, 4, 1, 1, 1},
-        {4, 0, 4, 4, 1, 1, 1},
-        {4, 0, 4, 4, 1, 1, 1}
-    };
-
-    size_t vector_length = vector_lengths[channels - 1][depth];
-    int offset_cols = (dst.offset / dst.elemSize1()) & (vector_length - 1);
-    int cols = divUp(dst.cols * channels + offset_cols, vector_length);
-
-    size_t localThreads[3]  = { 64, 4, 1 };
-    size_t globalThreads[3] = { divUp(cols, localThreads[0]) *localThreads[0],
-                                divUp(dst.rows, localThreads[1]) *localThreads[1],
-                                1
-                              };
-
-    int dst_step1 = dst.cols * dst.elemSize();
-    std::vector<std::pair<size_t , const void *> > args;
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src1.data ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1.step ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1.offset ));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src2.data ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src2.step ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src2.offset ));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst.data ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.step ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.offset ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1.rows ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&cols ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst_step1 ));
-    T scalar;
-    if(_scalar != NULL)
-    {
-        double scalar1 = *((double *)_scalar);
-        scalar = (T)scalar1;
-        args.push_back( std::make_pair( sizeof(T), (void *)&scalar ));
-    }
-    switch(op_type)
-    {
-        case MAT_ADD:
-            openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, -1, depth, "-D ARITHM_ADD");
-            break;
-        case MAT_SUB:
-            openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, -1, depth, "-D ARITHM_SUB");
-            break;
-        default:
-            openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, -1, depth);
-    }
-}
-static void arithmetic_run(const oclMat &src1, const oclMat &src2, oclMat &dst,
-                           String kernelName, const char **kernelString, int op_type = 0)
-{
-    arithmetic_run<char>(src1, src2, dst, kernelName, kernelString, (void *)NULL, op_type);
-}
-static void arithmetic_run(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask,
-                           String kernelName, const char **kernelString, int op_type = 0)
-{
-    if(!src1.clCxt->supportsFeature(Context::CL_DOUBLE) && src1.type() == CV_64F)
-    {
-        CV_Error(Error::GpuNotSupported, "Selected device don't support double\r\n");
-        return;
-    }
+    CV_Assert(src2.empty() || (!src2.empty() && src1.type() == src2.type() && src1.size() == src2.size()));
+    CV_Assert(mask.empty() || (!mask.empty() && mask.type() == CV_8UC1 && mask.size() == src1.size()));
+    CV_Assert(op_type >= ADD && op_type <= ABS_DIFF);
 
     dst.create(src1.size(), src1.type());
-    CV_Assert(src1.cols == src2.cols && src2.cols == dst.cols &&
-              src1.rows == src2.rows && src2.rows == dst.rows &&
-              src1.rows == mask.rows && src1.cols == mask.cols);
 
-    CV_Assert(src1.type() == src2.type() && src1.type() == dst.type());
-    CV_Assert(src1.depth() != CV_8S);
-    CV_Assert(mask.type() == CV_8U);
+    int oclChannels = src1.oclchannels(), depth = src1.depth();
+    int src1step1 = src1.step / src1.elemSize(), src1offset1 = src1.offset / src1.elemSize();
+    int src2step1 = src2.step / src2.elemSize(), src2offset1 = src2.offset / src2.elemSize();
+    int maskstep1 = mask.step, maskoffset1 = mask.offset / mask.elemSize();
+    int dststep1 = dst.step / dst.elemSize(), dstoffset1 = dst.offset / dst.elemSize();
+    oclMat m;
 
-    Context  *clCxt = src1.clCxt;
-    int channels = dst.oclchannels();
-    int depth = dst.depth();
+    size_t localThreads[3]  = { 16, 16, 1 };
+    size_t globalThreads[3] = { dst.cols, dst.rows, 1 };
 
-    int vector_lengths[4][7] = {{4, 4, 2, 2, 1, 1, 1},
-        {2, 2, 1, 1, 1, 1, 1},
-        {4, 4, 2, 2 , 1, 1, 1},
-        {1, 1, 1, 1, 1, 1, 1}
-    };
+    std::string kernelName = op_type == ABS_DIFF ? "arithm_absdiff" : "arithm_binary_op";
 
-    size_t vector_length = vector_lengths[channels - 1][depth];
-    int offset_cols = ((dst.offset % dst.step) / dst.elemSize()) & (vector_length - 1);
-    int cols = divUp(dst.cols + offset_cols, vector_length);
+    const char * const typeMap[] = { "uchar", "char", "ushort", "short", "int", "float", "double" };
+    const char * const WTypeMap[] = { "short", "short", "int", "int", "int", "float", "double" };
+    const char operationsMap[] = { '+', '-', '*', '/', '-' };
+    const char * const channelMap[] = { "", "", "2", "4", "4" };
+    bool haveScalar = use_scalar || src2.empty();
 
-    size_t localThreads[3]  = { 64, 4, 1 };
-    size_t globalThreads[3] = { divUp(cols, localThreads[0]) *localThreads[0],
-                                divUp(dst.rows, localThreads[1]) *localThreads[1],
-                                1
-                              };
+    int WDepth = depth;
+    if (haveScalar)
+        WDepth = hasDouble && WDepth == CV_64F ? CV_64F : CV_32F;
+    if (op_type == DIV)
+        WDepth = hasDouble ? CV_64F : CV_32F;
+    else if (op_type == MUL)
+        WDepth = hasDouble && (depth == CV_32S || depth == CV_64F) ? CV_64F : CV_32F;
+
+    std::string buildOptions = format("-D T=%s%s -D WT=%s%s -D convertToT=convert_%s%s%s -D Operation=%c"
+                                      " -D convertToWT=convert_%s%s",
+                                      typeMap[depth], channelMap[oclChannels],
+                                      WTypeMap[WDepth], channelMap[oclChannels],
+                                      typeMap[depth], channelMap[oclChannels], (depth >= CV_32F ? "" : (depth == CV_32S ? "_rte" : "_sat_rte")),
+                                      operationsMap[op_type], WTypeMap[WDepth], channelMap[oclChannels]);
 
-    int dst_step1 = dst.cols * dst.elemSize();
     std::vector<std::pair<size_t , const void *> > args;
     args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src1.data ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1.step ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1.offset ));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src2.data ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src2.step ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src2.offset ));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&mask.data ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&mask.step ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&mask.offset ));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst.data ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.step ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.offset ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1.rows ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&cols ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst_step1 ));
+    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1step1 ));
+    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1offset1 ));
 
-    switch (op_type)
+    if (!src2.empty())
     {
-        case MAT_ADD:
-            openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, channels, depth, "-D ARITHM_ADD");
-            break;
-        case MAT_SUB:
-            openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, channels, depth, "-D ARITHM_SUB");
-            break;
-        default:
-            openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, channels, depth);
+        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src2.data ));
+        args.push_back( std::make_pair( sizeof(cl_int), (void *)&src2step1 ));
+        args.push_back( std::make_pair( sizeof(cl_int), (void *)&src2offset1 ));
+
+        kernelName += "_mat";
     }
+
+    if (haveScalar)
+    {
+        const int WDepthMap[] = { CV_16S, CV_16S, CV_32S, CV_32S, CV_32S, CV_32F, CV_64F };
+        m.create(1, 1, CV_MAKE_TYPE(WDepthMap[WDepth], oclChannels));
+        m.setTo(scalar);
+
+        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&m.data ));
+
+        kernelName += "_scalar";
+    }
+
+    if (!mask.empty())
+    {
+        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&mask.data ));
+        args.push_back( std::make_pair( sizeof(cl_int), (void *)&maskstep1 ));
+        args.push_back( std::make_pair( sizeof(cl_int), (void *)&maskoffset1 ));
+
+        kernelName += "_mask";
+    }
+
+    if (op_type == DIV)
+        kernelName += "_div";
+
+    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst.data ));
+    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dststep1 ));
+    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dstoffset1 ));
+
+    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1.cols ));
+    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1.rows ));
+
+    openCLExecuteKernel(clCxt, mask.empty() ?
+                            (!src2.empty() ? &arithm_add : &arithm_add_scalar) :
+                            (!src2.empty() ? &arithm_add_mask : &arithm_add_scalar_mask),
+                        kernelName, globalThreads, localThreads,
+                        args, -1, -1, buildOptions.c_str());
 }
-void cv::ocl::add(const oclMat &src1, const oclMat &src2, oclMat &dst)
-{
-    arithmetic_run(src1, src2, dst, "arithm_add", &arithm_add, MAT_ADD);
-}
+
 void cv::ocl::add(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask)
 {
-    arithmetic_run(src1, src2, dst, mask, "arithm_add_with_mask", &arithm_add, MAT_ADD);
-}
-
-void cv::ocl::subtract(const oclMat &src1, const oclMat &src2, oclMat &dst)
-{
-    arithmetic_run(src1, src2, dst, "arithm_add", &arithm_add, MAT_SUB);
-}
-void cv::ocl::subtract(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask)
-{
-    arithmetic_run(src1, src2, dst, mask, "arithm_add_with_mask", &arithm_add, MAT_SUB);
-}
-typedef void (*MulDivFunc)(const oclMat &src1, const oclMat &src2, oclMat &dst, String kernelName,
-                           const char **kernelString, void *scalar);
-
-void cv::ocl::multiply(const oclMat &src1, const oclMat &src2, oclMat &dst, double scalar)
-{
-    if(src1.clCxt->supportsFeature(Context::CL_DOUBLE) && (src1.depth() == CV_64F))
-        arithmetic_run<double>(src1, src2, dst, "arithm_mul", &arithm_mul, (void *)(&scalar));
-    else
-        arithmetic_run<float>(src1, src2, dst, "arithm_mul", &arithm_mul, (void *)(&scalar));
-}
-
-void cv::ocl::divide(const oclMat &src1, const oclMat &src2, oclMat &dst, double scalar)
-{
-
-    if(src1.clCxt->supportsFeature(Context::CL_DOUBLE))
-        arithmetic_run<double>(src1, src2, dst, "arithm_div", &arithm_div, (void *)(&scalar));
-    else
-        arithmetic_run<float>(src1, src2, dst, "arithm_div", &arithm_div, (void *)(&scalar));
-
-}
-template <typename WT , typename CL_WT>
-void arithmetic_scalar_run(const oclMat &src1, const Scalar &src2, oclMat &dst, const oclMat &mask, String kernelName, const char **kernelString, int isMatSubScalar)
-{
-    if(!src1.clCxt->supportsFeature(Context::CL_DOUBLE) && src1.type() == CV_64F)
-    {
-        CV_Error(Error::GpuNotSupported, "Selected device don't support double\r\n");
-        return;
-    }
-
-    dst.create(src1.size(), src1.type());
-
-    CV_Assert(src1.cols == dst.cols && src1.rows == dst.rows &&
-              src1.type() == dst.type());
-
-    //CV_Assert(src1.depth() != CV_8S);
-
-    if(mask.data)
-    {
-        CV_Assert(mask.type() == CV_8U && src1.rows == mask.rows && src1.cols == mask.cols);
-    }
-
-    Context  *clCxt = src1.clCxt;
-    int channels = dst.oclchannels();
-    int depth = dst.depth();
-
-    WT s[4] = { saturate_cast<WT>(src2.val[0]), saturate_cast<WT>(src2.val[1]),
-                saturate_cast<WT>(src2.val[2]), saturate_cast<WT>(src2.val[3])
-              };
-
-    int vector_lengths[4][7] = {{4, 0, 2, 2, 1, 1, 1},
-        {2, 0, 1, 1, 1, 1, 1},
-        {4, 0, 2, 2 , 1, 1, 1},
-        {1, 0, 1, 1, 1, 1, 1}
-    };
-
-    size_t vector_length = vector_lengths[channels - 1][depth];
-    int offset_cols = ((dst.offset % dst.step) / dst.elemSize()) & (vector_length - 1);
-    int cols = divUp(dst.cols + offset_cols, vector_length);
-
-    size_t localThreads[3]  = { 64, 4, 1 };
-    size_t globalThreads[3] = { divUp(cols, localThreads[0]) *localThreads[0],
-                                divUp(dst.rows, localThreads[1]) *localThreads[1],
-                                1
-                              };
-
-    int dst_step1 = dst.cols * dst.elemSize();
-    std::vector<std::pair<size_t , const void *> > args;
-    args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src1.data ));
-    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src1.step ));
-    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src1.offset));
-    args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst.data ));
-    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.step ));
-    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.offset));
-
-    if(mask.data)
-    {
-        args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&mask.data ));
-        args.push_back( std::make_pair( sizeof(cl_int) , (void *)&mask.step ));
-        args.push_back( std::make_pair( sizeof(cl_int) , (void *)&mask.offset));
-    }
-    args.push_back( std::make_pair( sizeof(CL_WT) ,  (void *)&s ));
-    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src1.rows ));
-    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&cols ));
-    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst_step1 ));
-    if(isMatSubScalar != 0)
-        openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, channels, depth, "-D ARITHM_SUB");
-    else
-        openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, channels, depth, "-D ARITHM_ADD");
-}
-
-static void arithmetic_scalar_run(const oclMat &src, oclMat &dst, String kernelName, const char **kernelString, double scalar)
-{
-    if(!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.type() == CV_64F)
-    {
-        CV_Error(Error::GpuNotSupported, "Selected device don't support double\r\n");
-        return;
-    }
-
-    dst.create(src.size(), src.type());
-    CV_Assert(src.cols == dst.cols && src.rows == dst.rows);
-
-    CV_Assert(src.type() == dst.type());
-    CV_Assert(src.depth() != CV_8S);
-
-    Context  *clCxt = src.clCxt;
-    int channels = dst.oclchannels();
-    int depth = dst.depth();
-
-    int vector_lengths[4][7] = {{4, 0, 4, 4, 1, 1, 1},
-        {4, 0, 4, 4, 1, 1, 1},
-        {4, 0, 4, 4 , 1, 1, 1},
-        {4, 0, 4, 4, 1, 1, 1}
-    };
-
-    size_t vector_length = vector_lengths[channels - 1][depth];
-    int offset_cols = (dst.offset / dst.elemSize1()) & (vector_length - 1);
-    int cols = divUp(dst.cols * channels + offset_cols, vector_length);
-
-    size_t localThreads[3]  = { 64, 4, 1 };
-    size_t globalThreads[3] = { divUp(cols, localThreads[0]) *localThreads[0],
-                                divUp(dst.rows, localThreads[1]) *localThreads[1],
-                                1
-                              };
-
-    int dst_step1 = dst.cols * dst.elemSize();
-    std::vector<std::pair<size_t , const void *> > args;
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src.data ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.step ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.offset ));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst.data ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.step ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.offset ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.rows ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&cols ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst_step1 ));
-
-    float f_scalar = (float)scalar;
-    if(src.clCxt->supportsFeature(Context::CL_DOUBLE))
-        args.push_back( std::make_pair( sizeof(cl_double), (void *)&scalar ));
-    else
-    {
-        args.push_back( std::make_pair( sizeof(cl_float), (void *)&f_scalar));
-    }
-
-    openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, -1, depth);
-}
-
-typedef void (*ArithmeticFuncS)(const oclMat &src1, const Scalar &src2, oclMat &dst, const oclMat &mask, String kernelName, const char **kernelString, int isMatSubScalar);
-
-
-static void arithmetic_scalar(const oclMat &src1, const Scalar &src2, oclMat &dst, const oclMat &mask, String kernelName, const char **kernelString, int isMatSubScalar)
-{
-    static ArithmeticFuncS tab[8] =
-    {
-        arithmetic_scalar_run<int, cl_int4>,
-        arithmetic_scalar_run<int, cl_int4>,
-        arithmetic_scalar_run<int, cl_int4>,
-        arithmetic_scalar_run<int, cl_int4>,
-        arithmetic_scalar_run<int, cl_int4>,
-        arithmetic_scalar_run<float, cl_float4>,
-        arithmetic_scalar_run<double, cl_double4>,
-        0
-    };
-    ArithmeticFuncS func = tab[src1.depth()];
-    if(func == 0)
-        cv::error(Error::StsBadArg, "Unsupported arithmetic operation", "", __FILE__, __LINE__);
-    func(src1, src2, dst, mask, kernelName, kernelString, isMatSubScalar);
-}
-static void arithmetic_scalar(const oclMat &src1, const Scalar &src2, oclMat &dst, const oclMat &mask, String kernelName, const char **kernelString)
-{
-    arithmetic_scalar(src1, src2, dst, mask, kernelName, kernelString, 0);
+    arithmetic_run_generic(src1, src2, Scalar(), mask, dst, ADD);
 }
 
 void cv::ocl::add(const oclMat &src1, const Scalar &src2, oclMat &dst, const oclMat &mask)
 {
-    String kernelName = mask.data ? "arithm_s_add_with_mask" : "arithm_s_add";
-    const char **kernelString = mask.data ? &arithm_add_scalar_mask : &arithm_add_scalar;
+    arithmetic_run_generic(src1, oclMat(), src2, mask, dst, ADD); // empty oclMat() in the second-matrix slot: the generic runner skips the matrix-operand kernel args when that slot is empty; the Scalar src2 is forwarded instead
+}
 
-    arithmetic_scalar( src1, src2, dst, mask, kernelName, kernelString);
+void cv::ocl::subtract(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask)
+{
+    arithmetic_run_generic(src1, src2, Scalar(), mask, dst, SUB); // two-matrix form; Scalar() only fills the scalar slot (presumably ignored when a second matrix is given - confirm in arithmetic_run_generic)
 }
 
 void cv::ocl::subtract(const oclMat &src1, const Scalar &src2, oclMat &dst, const oclMat &mask)
 {
-    String kernelName = mask.data ? "arithm_s_add_with_mask" : "arithm_s_add";
-    const char **kernelString = mask.data ? &arithm_add_scalar_mask : &arithm_add_scalar;
-
-    arithmetic_scalar( src1, src2, dst, mask, kernelName, kernelString, 1);
+    arithmetic_run_generic(src1, oclMat(), src2, mask, dst, SUB); // matrix/Scalar form: empty oclMat() means "no second matrix", the Scalar src2 is the other operand
 }
-void cv::ocl::subtract(const Scalar &src2, const oclMat &src1, oclMat &dst, const oclMat &mask)
+
+void cv::ocl::multiply(const oclMat &src1, const oclMat &src2, oclMat &dst, double scalar)
 {
-    String kernelName = mask.data ? "arithm_s_add_with_mask" : "arithm_s_add";
-    const char **kernelString = mask.data ? &arithm_add_scalar_mask : &arithm_add_scalar;
-
-    arithmetic_scalar( src1, src2, dst, mask, kernelName, kernelString, -1);
+    const bool use_scalar = !(std::abs(scalar - 1.0) < std::numeric_limits<double>::epsilon()); // false only when scalar is within machine epsilon of 1.0; a NaN scalar yields true
+    arithmetic_run_generic(src1, src2, Scalar::all(scalar), oclMat(), dst, MUL, use_scalar); // no mask (empty oclMat()); use_scalar is the optional 7th argument - how it is consumed is defined in arithmetic_run_generic, outside this hunk
 }
+
 void cv::ocl::multiply(double scalar, const oclMat &src, oclMat &dst)
 {
-    String kernelName = "arithm_muls";
-    arithmetic_scalar_run( src, dst, kernelName, &arithm_mul, scalar);
+    arithmetic_run_generic(src, oclMat(), Scalar::all(scalar), oclMat(), dst, MUL); // no second matrix and no mask (both empty oclMat()); the factor is replicated into all four Scalar components
 }
-void cv::ocl::divide(double scalar, const oclMat &src,  oclMat &dst)
-{
-    if(!src.clCxt->supportsFeature(Context::CL_DOUBLE))
-    {
-        CV_Error(Error::GpuNotSupported, "Selected device don't support double\r\n");
-        return;
-    }
 
-    String kernelName =  "arithm_s_div";
-    arithmetic_scalar_run(src, dst, kernelName, &arithm_div, scalar);
+void cv::ocl::divide(const oclMat &src1, const oclMat &src2, oclMat &dst, double scalar)
+{
+    const bool use_scalar = !(std::abs(scalar - 1.0) < std::numeric_limits<double>::epsilon()); // false only when scalar is within machine epsilon of 1.0; a NaN scalar yields true
+    arithmetic_run_generic(src1, src2, Scalar::all(scalar), oclMat(), dst, DIV, use_scalar); // no mask (empty oclMat()); DIV makes the generic runner append "_div" to the kernel name
 }
+
+void cv::ocl::divide(double scalar, const oclMat &src, oclMat &dst)
+{
+    arithmetic_run_generic(src, oclMat(), Scalar::all(scalar), oclMat(), dst, DIV); // no second matrix and no mask (both empty oclMat()); NOTE(review): operand order (scalar / src vs. src / scalar) is decided by the "_div" kernel - confirm there
+}
+
 //////////////////////////////////////////////////////////////////////////////
-/////////////////////////////////  Absdiff ///////////////////////////////////
+///////////////////////////////// Absdiff ////////////////////////////////////
 //////////////////////////////////////////////////////////////////////////////
+
 void cv::ocl::absdiff(const oclMat &src1, const oclMat &src2, oclMat &dst)
 {
-    arithmetic_run(src1, src2, dst, "arithm_absdiff", &arithm_absdiff);
+    arithmetic_run_generic(src1, src2, Scalar(), oclMat(), dst, ABS_DIFF); // two-matrix form without mask; Scalar() only fills the scalar slot
 }
+
 void cv::ocl::absdiff(const oclMat &src1, const Scalar &src2, oclMat &dst)
 {
-    String kernelName = "arithm_s_absdiff";
-    oclMat mask;
-    arithmetic_scalar( src1, src2, dst, mask, kernelName, &arithm_absdiff);
+    arithmetic_run_generic(src1, oclMat(), src2, oclMat(), dst, ABS_DIFF); // matrix/Scalar form: no second matrix and no mask (both empty oclMat())
 }
+
 //////////////////////////////////////////////////////////////////////////////
 /////////////////////////////////  compare ///////////////////////////////////
 //////////////////////////////////////////////////////////////////////////////
-static void compare_run(const oclMat &src1, const oclMat &src2, oclMat &dst, String kernelName, const char **kernelString)
+
+static void compare_run(const oclMat &src1, const oclMat &src2, oclMat &dst, int cmpOp,
+                        String kernelName, const char **kernelString)
 {
-    dst.create(src1.size(), CV_8UC1);
-    CV_Assert(src1.oclchannels() == 1);
     CV_Assert(src1.type() == src2.type());
-    Context  *clCxt = src1.clCxt;
+    dst.create(src1.size(), CV_8UC1); // result is always CV_8UC1 with the size of src1
+    Context *clCxt = src1.clCxt;
+
     int depth = src1.depth();
-    int vector_lengths[7] = {4, 0, 4, 4, 4, 4, 4};
-    size_t vector_length = vector_lengths[depth];
-    int offset_cols = (dst.offset / dst.elemSize1()) & (vector_length - 1);
-    int cols = divUp(dst.cols  + offset_cols, vector_length);
     size_t localThreads[3]  = { 64, 4, 1 };
-    size_t globalThreads[3] = { divUp(cols, localThreads[0]) *localThreads[0],
-                                divUp(dst.rows, localThreads[1]) *localThreads[1],
-                                1
-                              };
-    int dst_step1 = dst.cols * dst.elemSize();
+    size_t globalThreads[3] = { (size_t)dst.cols, (size_t)dst.rows, 1 }; // global size = dst.cols x dst.rows; explicit casts avoid an int -> size_t narrowing conversion inside the braced initializer
+
+    int src1step1 = src1.step1(), src1offset1 = src1.offset / src1.elemSize1(); // offsets are passed in elemSize1() units, not bytes (step1() presumably uses the same unit - confirm)
+    int src2step1 = src2.step1(), src2offset1 = src2.offset / src2.elemSize1();
+    int dststep1 = dst.step1(), dstoffset1 = dst.offset / dst.elemSize1();
+
+    const char * const typeMap[] = { "uchar", "char", "ushort", "short", "int", "float", "double" }; // indexed by depth; 7 entries, so depth is assumed to be in [0, 6]
+    const char * const operationMap[] = { "==", ">", ">=", "<", "<=", "!=" }; // indexed by cmpOp; 6 entries, the caller asserts CMP_EQ <= cmpOp <= CMP_NE
+    std::string buildOptions = format("-D T=%s -D Operation=%s", typeMap[depth], operationMap[cmpOp]); // element type and comparison operator reach the kernel as preprocessor macros
+
     std::vector<std::pair<size_t , const void *> > args;
     args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src1.data ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1.step ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1.offset ));
+    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1step1 ));
+    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1offset1 ));
     args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src2.data ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src2.step ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src2.offset ));
+    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src2step1 ));
+    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src2offset1 ));
     args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst.data ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.step ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.offset ));
+    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dststep1 ));
+    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dstoffset1 ));
+    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1.cols ));
     args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1.rows ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&cols ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst_step1 ));
-    openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, -1, depth);
+
+    openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads,
+                        args, -1, -1, buildOptions.c_str());
 }
 
 void cv::ocl::compare(const oclMat &src1, const oclMat &src2, oclMat &dst , int cmpOp)
 {
-    if(!src1.clCxt->supportsFeature(Context::CL_DOUBLE) && src1.type() == CV_64F)
+    if (!src1.clCxt->supportsFeature(Context::CL_DOUBLE) && src1.depth() == CV_64F)
     {
-        std::cout << "Selected device do not support double" << std::endl;
+        CV_Error(Error::GpuNotSupported, "Selected device doesn't support double"); // raise like sum()/absSum()/sqrSum() do, instead of printing and returning silently with dst untouched
         return;
     }
-    String kernelName;
-    const char **kernelString = NULL;
-    switch( cmpOp )
-    {
-    case CMP_EQ:
-        kernelName = "arithm_compare_eq";
-        kernelString = &arithm_compare_eq;
-        break;
-    case CMP_GT:
-        kernelName = "arithm_compare_gt";
-        kernelString = &arithm_compare_eq;
-        break;
-    case CMP_GE:
-        kernelName = "arithm_compare_ge";
-        kernelString = &arithm_compare_eq;
-        break;
-    case CMP_NE:
-        kernelName = "arithm_compare_ne";
-        kernelString = &arithm_compare_ne;
-        break;
-    case CMP_LT:
-        kernelName = "arithm_compare_lt";
-        kernelString = &arithm_compare_ne;
-        break;
-    case CMP_LE:
-        kernelName = "arithm_compare_le";
-        kernelString = &arithm_compare_ne;
-        break;
-    default:
-        CV_Error(Error::StsBadArg, "Unknown comparison method");
-    }
-    compare_run(src1, src2, dst, kernelName, kernelString);
+
+    CV_Assert(src1.channels() == 1 && src2.channels() == 1); // only single-channel inputs are accepted
+    CV_Assert(cmpOp >= CMP_EQ && cmpOp <= CMP_NE); // keeps cmpOp inside compare_run's 6-entry operator table (assumes CMP_EQ..CMP_NE are 0..5 - confirm against the enum)
+
+    compare_run(src1, src2, dst, cmpOp, "arithm_compare", &arithm_compare); // one kernel for all operators; the operator is selected at build time from cmpOp
 }
 
 //////////////////////////////////////////////////////////////////////////////
@@ -604,7 +340,7 @@ static void arithmetic_sum_buffer_run(const oclMat &src, cl_mem &dst, int vlen ,
     args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data));
     args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst ));
     size_t gt[3] = {groupnum * 256, 1, 1}, lt[3] = {256, 1, 1};
-    if(src.oclchannels() != 3)
+    if (src.oclchannels() != 3)
         openCLExecuteKernel(src.clCxt, &arithm_sum, "arithm_op_sum", gt, lt, args, -1, -1, build_options);
     else
         openCLExecuteKernel(src.clCxt, &arithm_sum_3, "arithm_op_sum_3", gt, lt, args, -1, -1, build_options);
@@ -617,23 +353,21 @@ Scalar arithmetic_sum(const oclMat &src, int type = 0)
     CV_Assert(groupnum != 0);
     int vlen = src.oclchannels() == 3 ? 12 : 8, dbsize = groupnum * vlen;
     Context *clCxt = src.clCxt;
-    T *p = new T[dbsize];
+
+    AutoBuffer<T> _buf(dbsize);
+    T *p = (T*)_buf;
     cl_mem dstBuffer = openCLCreateBuffer(clCxt, CL_MEM_WRITE_ONLY, dbsize * sizeof(T));
-    Scalar s;
-    s.val[0] = 0.0;
-    s.val[1] = 0.0;
-    s.val[2] = 0.0;
-    s.val[3] = 0.0;
+    Scalar s = Scalar::all(0.0);
     arithmetic_sum_buffer_run(src, dstBuffer, vlen, groupnum, type);
 
     memset(p, 0, dbsize * sizeof(T));
     openCLReadBuffer(clCxt, dstBuffer, (void *)p, dbsize * sizeof(T));
-    for(int i = 0; i < dbsize;)
+    for (int i = 0; i < dbsize;)
     {
-        for(int j = 0; j < src.oclchannels(); j++, i++)
+        for (int j = 0; j < src.oclchannels(); j++, i++)
             s.val[j] += p[i];
     }
-    delete[] p;
+
     openCLFree(dstBuffer);
     return s;
 }
@@ -641,9 +375,9 @@ Scalar arithmetic_sum(const oclMat &src, int type = 0)
 typedef Scalar (*sumFunc)(const oclMat &src, int type);
 Scalar cv::ocl::sum(const oclMat &src)
 {
-    if(!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F)
+    if (!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F)
     {
-        CV_Error(Error::GpuNotSupported, "select device don't support double");
+        CV_Error(Error::GpuNotSupported, "Selected device doesn't support double");
     }
     static sumFunc functab[2] =
     {
@@ -658,9 +392,9 @@ Scalar cv::ocl::sum(const oclMat &src)
 
 Scalar cv::ocl::absSum(const oclMat &src)
 {
-    if(!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F)
+    if (!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F)
     {
-        CV_Error(Error::GpuNotSupported, "select device don't support double");
+        CV_Error(Error::GpuNotSupported, "Selected device doesn't support double");
     }
     static sumFunc functab[2] =
     {
@@ -675,9 +409,9 @@ Scalar cv::ocl::absSum(const oclMat &src)
 
 Scalar cv::ocl::sqrSum(const oclMat &src)
 {
-    if(!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F)
+    if (!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F)
     {
-        CV_Error(Error::GpuNotSupported, "select device don't support double");
+        CV_Error(Error::GpuNotSupported, "Selected device doesn't support double");
     }
     static sumFunc functab[2] =
     {
@@ -689,9 +423,11 @@ Scalar cv::ocl::sqrSum(const oclMat &src)
     func = functab[(int)src.clCxt->supportsFeature(Context::CL_DOUBLE)];
     return func(src, 2);
 }
+
 //////////////////////////////////////////////////////////////////////////////
 //////////////////////////////// meanStdDev //////////////////////////////////
 //////////////////////////////////////////////////////////////////////////////
+
 void cv::ocl::meanStdDev(const oclMat &src, Scalar &mean, Scalar &stddev)
 {
     CV_Assert(src.depth() <= CV_32S);
@@ -700,12 +436,14 @@ void cv::ocl::meanStdDev(const oclMat &src, Scalar &mean, Scalar &stddev)
     Mat m1(sz, CV_MAKETYPE(CV_32S, channels), cv::Scalar::all(0)),
         m2(sz, CV_MAKETYPE(CV_32S, channels), cv::Scalar::all(0));
     oclMat dst1(m1), dst2(m2);
+
     //arithmetic_sum_run(src, dst1,"arithm_op_sum");
     //arithmetic_sum_run(src, dst2,"arithm_op_squares_sum");
+
     m1 = (Mat)dst1;
     m2 = (Mat)dst2;
     int i = 0, *p = (int *)m1.data, *q = (int *)m2.data;
-    for(; i < channels; i++)
+    for (; i < channels; i++)
     {
         mean.val[i] = (double)p[i] / (src.cols * src.rows);
         stddev.val[i] = std::sqrt(std::max((double) q[i] / (src.cols * src.rows) - mean.val[i] * mean.val[i] , 0.));
@@ -715,6 +453,7 @@ void cv::ocl::meanStdDev(const oclMat &src, Scalar &mean, Scalar &stddev)
 //////////////////////////////////////////////////////////////////////////////
 //////////////////////////////////// minMax  /////////////////////////////////
 //////////////////////////////////////////////////////////////////////////////
+
 static void arithmetic_minMax_run(const oclMat &src, const oclMat &mask, cl_mem &dst, int vlen , int groupnum, String kernelName)
 {
     std::vector<std::pair<size_t , const void *> > args;
@@ -734,7 +473,7 @@ static void arithmetic_minMax_run(const oclMat &src, const oclMat &mask, cl_mem
     args.push_back( std::make_pair( sizeof(cl_int) , (void *)&elemnum));
     args.push_back( std::make_pair( sizeof(cl_int) , (void *)&groupnum));
     args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data));
-    if(!mask.empty())
+    if (!mask.empty())
     {
         int mall_cols = mask.step / (vlen * mask.elemSize1());
         int mpre_cols = (mask.offset % mask.step) / (vlen * mask.elemSize1());
@@ -757,7 +496,7 @@ static void arithmetic_minMax_mask_run(const oclMat &src, const oclMat &mask, cl
     std::vector<std::pair<size_t , const void *> > args;
     size_t gt[3] = {groupnum * 256, 1, 1}, lt[3] = {256, 1, 1};
     char build_options[50];
-    if(src.oclchannels() == 1)
+    if (src.oclchannels() == 1)
     {
         int cols = (src.cols - 1) / vlen + 1;
         int invalid_cols = src.step / (vlen * src.elemSize1()) - cols;
@@ -777,8 +516,6 @@ static void arithmetic_minMax_mask_run(const oclMat &src, const oclMat &mask, cl
         args.push_back( std::make_pair( sizeof(cl_int) , (void *)&moffset ));
         args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&mask.data ));
         args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst ));
-        //        printf("elemnum:%d,cols:%d,invalid_cols:%d,offset:%d,minvalid_cols:%d,moffset:%d,repeat_e:%d\r\n",
-        //               elemnum,cols,invalid_cols,offset,minvalid_cols,moffset,repeat_me);
         openCLExecuteKernel(src.clCxt, &arithm_minMax_mask, kernelName, gt, lt, args, -1, -1, build_options);
     }
 }
@@ -807,18 +544,18 @@ template <typename T> void arithmetic_minMax(const oclMat &src, double *minVal,
 
     Mat matbuf = Mat(buf);
     T *p = matbuf.ptr<T>();
-    if(minVal != NULL)
+    if (minVal != NULL)
     {
         *minVal = std::numeric_limits<double>::max();
-        for(int i = 0; i < vlen * (int)groupnum; i++)
+        for (int i = 0; i < vlen * (int)groupnum; i++)
         {
             *minVal = *minVal < p[i] ? *minVal : p[i];
         }
     }
-    if(maxVal != NULL)
+    if (maxVal != NULL)
     {
         *maxVal = -std::numeric_limits<double>::max();
-        for(int i = vlen * (int)groupnum; i < 2 * vlen * (int)groupnum; i++)
+        for (int i = vlen * (int)groupnum; i < 2 * vlen * (int)groupnum; i++)
         {
             *maxVal = *maxVal > p[i] ? *maxVal : p[i];
         }
@@ -831,12 +568,13 @@ void cv::ocl::minMax(const oclMat &src, double *minVal, double *maxVal, const oc
     oclMat buf;
     minMax_buf(src, minVal, maxVal, mask, buf);
 }
+
 void cv::ocl::minMax_buf(const oclMat &src, double *minVal, double *maxVal, const oclMat &mask, oclMat &buf)
 {
     CV_Assert(src.oclchannels() == 1);
-    if(!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F)
+    if (!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F)
     {
-        CV_Error(Error::GpuNotSupported, "select device don't support double");
+        CV_Error(Error::GpuNotSupported, "Selected device doesn't support double");
     }
     static minMaxFunc functab[8] =
     {
@@ -857,6 +595,7 @@ void cv::ocl::minMax_buf(const oclMat &src, double *minVal, double *maxVal, cons
 //////////////////////////////////////////////////////////////////////////////
 /////////////////////////////////// norm /////////////////////////////////////
 //////////////////////////////////////////////////////////////////////////////
+
 double cv::ocl::norm(const oclMat &src1, int normType)
 {
     return norm(src1, oclMat(src1.size(), src1.type(), Scalar::all(0)), normType);
@@ -881,7 +620,7 @@ double cv::ocl::norm(const oclMat &src1, const oclMat &src2, int normType)
         m = (gm2);
         p = (int *)m.data;
         r = -std::numeric_limits<double>::max();
-        for(i = 0; i < channels; i++)
+        for (i = 0; i < channels; i++)
         {
             r = std::max(r, (double)p[i]);
         }
@@ -891,7 +630,7 @@ double cv::ocl::norm(const oclMat &src1, const oclMat &src2, int normType)
         //arithmetic_sum_run(gm1, gm2,"arithm_op_sum");
         m = (gm2);
         p = (int *)m.data;
-        for(i = 0; i < channels; i++)
+        for (i = 0; i < channels; i++)
         {
             r = r + (double)p[i];
         }
@@ -901,14 +640,14 @@ double cv::ocl::norm(const oclMat &src1, const oclMat &src2, int normType)
         //arithmetic_sum_run(gm1, gm2,"arithm_op_squares_sum");
         m = (gm2);
         p = (int *)m.data;
-        for(i = 0; i < channels; i++)
+        for (i = 0; i < channels; i++)
         {
             r = r + (double)p[i];
         }
         r = std::sqrt(r);
         break;
     }
-    if(isRelative)
+    if (isRelative)
         r = r / norm(src2, normType);
     return r;
 }
@@ -916,11 +655,12 @@ double cv::ocl::norm(const oclMat &src1, const oclMat &src2, int normType)
 //////////////////////////////////////////////////////////////////////////////
 ////////////////////////////////// flip //////////////////////////////////////
 //////////////////////////////////////////////////////////////////////////////
+
 static void arithmetic_flip_rows_run(const oclMat &src, oclMat &dst, String kernelName)
 {
-    if(!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.type() == CV_64F)
+    if (!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.type() == CV_64F)
     {
-        CV_Error(Error::GpuNotSupported, "Selected device don't support double\r\n");
+        CV_Error(Error::GpuNotSupported, "Selected device doesn't support double\r\n");
         return;
     }
 
@@ -945,10 +685,7 @@ static void arithmetic_flip_rows_run(const oclMat &src, oclMat &dst, String kern
     int rows = divUp(dst.rows, 2);
 
     size_t localThreads[3]  = { 64, 4, 1 };
-    size_t globalThreads[3] = { divUp(cols, localThreads[0]) *localThreads[0],
-                                divUp(rows, localThreads[1]) *localThreads[1],
-                                1
-                              };
+    size_t globalThreads[3] = { cols, rows, 1 };
 
     int dst_step1 = dst.cols * dst.elemSize();
     std::vector<std::pair<size_t , const void *> > args;
@@ -965,11 +702,12 @@ static void arithmetic_flip_rows_run(const oclMat &src, oclMat &dst, String kern
 
     openCLExecuteKernel(clCxt, &arithm_flip, kernelName, globalThreads, localThreads, args, -1, depth);
 }
+
 static void arithmetic_flip_cols_run(const oclMat &src, oclMat &dst, String kernelName, bool isVertical)
 {
-    if(!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.type() == CV_64F)
+    if (!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.type() == CV_64F)
     {
-        CV_Error(Error::GpuNotSupported, "Selected device don't support double\r\n");
+        CV_Error(Error::GpuNotSupported, "Selected device doesn't support double\r\n");
         return;
     }
 
@@ -993,10 +731,7 @@ static void arithmetic_flip_cols_run(const oclMat &src, oclMat &dst, String kern
     int rows = isVertical ?  divUp(dst.rows, 2) : dst.rows;
 
     size_t localThreads[3]  = { 64, 4, 1 };
-    size_t globalThreads[3] = { divUp(cols, localThreads[0]) *localThreads[0],
-                                divUp(rows, localThreads[1]) *localThreads[1],
-                                1
-                              };
+    size_t globalThreads[3] = { cols, rows, 1 };
 
     int dst_step1 = dst.cols * dst.elemSize();
     std::vector<std::pair<size_t , const void *> > args;
@@ -1009,7 +744,7 @@ static void arithmetic_flip_cols_run(const oclMat &src, oclMat &dst, String kern
     args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.rows ));
     args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.cols ));
 
-    if(isVertical)
+    if (isVertical)
         args.push_back( std::make_pair( sizeof(cl_int), (void *)&rows ));
     else
         args.push_back( std::make_pair( sizeof(cl_int), (void *)&cols ));
@@ -1020,14 +755,15 @@ static void arithmetic_flip_cols_run(const oclMat &src, oclMat &dst, String kern
 
     openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, src.oclchannels(), depth);
 }
+
 void cv::ocl::flip(const oclMat &src, oclMat &dst, int flipCode)
 {
     dst.create(src.size(), src.type());
-    if(flipCode == 0)
+    if (flipCode == 0)
     {
         arithmetic_flip_rows_run(src, dst, "arithm_flip_rows");
     }
-    else if(flipCode > 0)
+    else if (flipCode > 0)
         arithmetic_flip_cols_run(src, dst, "arithm_flip_cols", false);
     else
         arithmetic_flip_cols_run(src, dst, "arithm_flip_rc", true);
@@ -1036,100 +772,45 @@ void cv::ocl::flip(const oclMat &src, oclMat &dst, int flipCode)
 //////////////////////////////////////////////////////////////////////////////
 ////////////////////////////////// LUT  //////////////////////////////////////
 //////////////////////////////////////////////////////////////////////////////
-static void arithmetic_lut_run(const oclMat &src1, const oclMat &src2, oclMat &dst, String kernelName)
-{
-    Context *clCxt = src1.clCxt;
-    int channels = src1.oclchannels();
-    int rows = src1.rows;
-    int cols = src1.cols;
-    //int step = src1.step;
-    int src_step = src1.step / src1.elemSize();
-    int dst_step = dst.step / dst.elemSize();
-    int whole_rows = src1.wholerows;
-    int whole_cols = src1.wholecols;
-    int src_offset = src1.offset / src1.elemSize();
-    int dst_offset = dst.offset / dst.elemSize();
-    int lut_offset = src2.offset / src2.elemSize();
-    int left_col = 0, right_col = 0;
-    size_t localSize[] = {16, 16, 1};
-    //cl_kernel kernel = openCLGetKernelFromSource(clCxt,&arithm_LUT,kernelName);
-    size_t globalSize[] = {(cols + localSize[0] - 1) / localSize[0] *localSize[0], (rows + localSize[1] - 1) / localSize[1] *localSize[1], 1};
-    if(channels == 1 && cols > 6)
-    {
-        left_col = 4 - (dst_offset & 3);
-        left_col &= 3;
-        dst_offset += left_col;
-        src_offset += left_col;
-        cols -= left_col;
-        right_col = cols & 3;
-        cols -= right_col;
-        globalSize[0] = (cols / 4 + localSize[0] - 1) / localSize[0] * localSize[0];
-    }
-    else if(channels == 1)
-    {
-        left_col = cols;
-        right_col = 0;
-        cols = 0;
-        globalSize[0] = 0;
-    }
-    CV_Assert(clCxt == dst.clCxt);
-    CV_Assert(src1.cols == dst.cols);
-    CV_Assert(src1.rows == dst.rows);
-    CV_Assert(src1.oclchannels() == dst.oclchannels());
-    //  CV_Assert(src1.step == dst.step);
-    std::vector<std::pair<size_t , const void *> > args;
 
-    if(globalSize[0] != 0)
-    {
-        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst.data ));
-        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src1.data ));
-        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src2.data ));
-        args.push_back( std::make_pair( sizeof(cl_int), (void *)&rows ));
-        args.push_back( std::make_pair( sizeof(cl_int), (void *)&cols ));
-        args.push_back( std::make_pair( sizeof(cl_int), (void *)&channels ));
-        args.push_back( std::make_pair( sizeof(cl_int), (void *)&whole_rows ));
-        args.push_back( std::make_pair( sizeof(cl_int), (void *)&whole_cols ));
-        args.push_back( std::make_pair( sizeof(cl_int), (void *)&src_offset ));
-        args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst_offset ));
-        args.push_back( std::make_pair( sizeof(cl_int), (void *)&lut_offset ));
-        args.push_back( std::make_pair( sizeof(cl_int), (void *)&src_step ));
-        args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst_step ));
-        openCLExecuteKernel(clCxt, &arithm_LUT, kernelName, globalSize, localSize, args, src1.oclchannels(), src1.depth());
-    }
-    if(channels == 1 && (left_col != 0 || right_col != 0))
-    {
-        src_offset = src1.offset;
-        dst_offset = dst.offset;
-        localSize[0] = 1;
-        localSize[1] = 256;
-        globalSize[0] = left_col + right_col;
-        globalSize[1] = (rows + localSize[1] - 1) / localSize[1] * localSize[1];
-        //kernel = openCLGetKernelFromSource(clCxt,&arithm_LUT,"LUT2");
-        args.clear();
-        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst.data ));
-        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src1.data ));
-        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src2.data ));
-        args.push_back( std::make_pair( sizeof(cl_int), (void *)&rows ));
-        args.push_back( std::make_pair( sizeof(cl_int), (void *)&left_col ));
-        args.push_back( std::make_pair( sizeof(cl_int), (void *)&channels ));
-        args.push_back( std::make_pair( sizeof(cl_int), (void *)&whole_rows ));
-        args.push_back( std::make_pair( sizeof(cl_int), (void *)&cols ));
-        args.push_back( std::make_pair( sizeof(cl_int), (void *)&src_offset ));
-        args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst_offset ));
-        args.push_back( std::make_pair( sizeof(cl_int), (void *)&lut_offset ));
-        args.push_back( std::make_pair( sizeof(cl_int), (void *)&src_step ));
-        args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst_step ));
-        openCLExecuteKernel(clCxt, &arithm_LUT, "LUT2", globalSize, localSize, args, src1.oclchannels(), src1.depth());
-    }
+static void arithmetic_lut_run(const oclMat &src, const oclMat &lut, oclMat &dst, String kernelName)
+{
+    Context *clCxt = src.clCxt;
+    int sdepth = src.depth();
+    int src_step1 = src.step1(), dst_step1 = dst.step1();
+    int src_offset1 = src.offset / src.elemSize1(), dst_offset1 = dst.offset / dst.elemSize1();
+    int lut_offset1 = lut.offset / lut.elemSize1() + (sdepth == CV_8U ? 0 : 128) * lut.channels();
+    int cols1 = src.cols * src.oclchannels();
+
+    size_t localSize[] = { 16, 16, 1 };
+    size_t globalSize[] = { lut.channels() == 1 ? cols1 : src.cols, src.rows, 1 };
+
+    const char * const typeMap[] = { "uchar", "char", "ushort", "short", "int", "float", "double" };
+    std::string buildOptions = format("-D srcT=%s -D dstT=%s", typeMap[sdepth], typeMap[dst.depth()]);
+
+    std::vector<std::pair<size_t , const void *> > args;
+    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src.data ));
+    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&lut.data ));
+    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst.data ));
+    args.push_back( std::make_pair( sizeof(cl_int), (void *)&cols1));
+    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.rows ));
+    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src_offset1 ));
+    args.push_back( std::make_pair( sizeof(cl_int), (void *)&lut_offset1 ));
+    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst_offset1 ));
+    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src_step1 ));
+    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst_step1 ));
+
+    openCLExecuteKernel(clCxt, &arithm_LUT, kernelName, globalSize, localSize,
+                        args, lut.oclchannels(), -1, buildOptions.c_str());
 }
 
 void cv::ocl::LUT(const oclMat &src, const oclMat &lut, oclMat &dst)
 {
-    int cn = src.channels();
-    CV_Assert(src.depth() == CV_8U);
-    CV_Assert((lut.oclchannels() == 1 || lut.oclchannels() == cn) && lut.rows == 1 && lut.cols == 256);
+    int cn = src.channels(), depth = src.depth();
+    CV_Assert(depth == CV_8U || depth == CV_8S);
+    CV_Assert(lut.channels() == 1 || lut.channels() == src.channels());
+    CV_Assert(lut.rows == 1 && lut.cols == 256);
     dst.create(src.size(), CV_MAKETYPE(lut.depth(), cn));
-    //oclMat _lut(lut);
     String kernelName = "LUT";
     arithmetic_lut_run(src, lut, dst, kernelName);
 }
@@ -1137,42 +818,44 @@ void cv::ocl::LUT(const oclMat &src, const oclMat &lut, oclMat &dst)
 //////////////////////////////////////////////////////////////////////////////
 //////////////////////////////// exp log /////////////////////////////////////
 //////////////////////////////////////////////////////////////////////////////
+
 static void arithmetic_exp_log_run(const oclMat &src, oclMat &dst, String kernelName, const char **kernelString)
 {
-    dst.create(src.size(), src.type());
-    CV_Assert(src.cols == dst.cols &&
-              src.rows == dst.rows );
-
-    CV_Assert(src.type() == dst.type());
-    CV_Assert( src.type() == CV_32F || src.type() == CV_64F);
-
     Context  *clCxt = src.clCxt;
-    if(!clCxt->supportsFeature(Context::CL_DOUBLE) && src.type() == CV_64F)
+    if (!clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F)
     {
-        CV_Error(Error::GpuNotSupported, "Selected device don't support double\r\n");
+        CV_Error(Error::GpuNotSupported, "Selected device doesn't support double\r\n");
         return;
     }
-    //int channels = dst.oclchannels();
-    int depth = dst.depth();
+
+    CV_Assert( src.depth() == CV_32F || src.depth() == CV_64F);
+    dst.create(src.size(), src.type());
+
+    int ddepth = dst.depth();
+    int cols1 = src.cols * src.oclchannels();
+    int srcoffset1 = src.offset / src.elemSize1(), dstoffset1 = dst.offset / dst.elemSize1();
+    int srcstep1 = src.step1(), dststep1 = dst.step1();
 
     size_t localThreads[3]  = { 64, 4, 1 };
-    size_t globalThreads[3] = { divUp(dst.cols, localThreads[0]) *localThreads[0],
-                                divUp(dst.rows, localThreads[1]) *localThreads[1],
-                                1
-                              };
+    size_t globalThreads[3] = { dst.cols, dst.rows, 1 };
+
+    std::string buildOptions = format("-D srcT=%s",
+                                      ddepth == CV_32F ? "float" : "double");
 
     std::vector<std::pair<size_t , const void *> > args;
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.rows ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.cols ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.step ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.step ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.offset ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.offset ));
     args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src.data ));
     args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst.data ));
+    args.push_back( std::make_pair( sizeof(cl_int), (void *)&cols1 ));
+    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.rows ));
+    args.push_back( std::make_pair( sizeof(cl_int), (void *)&srcoffset1 ));
+    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dstoffset1 ));
+    args.push_back( std::make_pair( sizeof(cl_int), (void *)&srcstep1 ));
+    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dststep1 ));
 
-    openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, -1, depth);
+    openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads,
+                        args, src.oclchannels(), -1, buildOptions.c_str());
 }
+
 void cv::ocl::exp(const oclMat &src, oclMat &dst)
 {
     arithmetic_exp_log_run(src, dst, "arithm_exp", &arithm_exp);
@@ -1186,11 +869,12 @@ void cv::ocl::log(const oclMat &src, oclMat &dst)
 //////////////////////////////////////////////////////////////////////////////
 ////////////////////////////// magnitude phase ///////////////////////////////
 //////////////////////////////////////////////////////////////////////////////
+
 static void arithmetic_magnitude_phase_run(const oclMat &src1, const oclMat &src2, oclMat &dst, String kernelName)
 {
-    if(!src1.clCxt->supportsFeature(Context::CL_DOUBLE) && src1.type() == CV_64F)
+    if (!src1.clCxt->supportsFeature(Context::CL_DOUBLE) && src1.type() == CV_64F)
     {
-        CV_Error(Error::GpuNotSupported, "Selected device don't support double\r\n");
+        CV_Error(Error::GpuNotSupported, "Selected device doesn't support double\r\n");
         return;
     }
 
@@ -1201,13 +885,9 @@ static void arithmetic_magnitude_phase_run(const oclMat &src1, const oclMat &src
     size_t vector_length = 1;
     int offset_cols = ((dst.offset % dst.step) / dst.elemSize1()) & (vector_length - 1);
     int cols = divUp(dst.cols * channels + offset_cols, vector_length);
-    int rows = dst.rows;
 
     size_t localThreads[3]  = { 64, 4, 1 };
-    size_t globalThreads[3] = { divUp(cols, localThreads[0]) *localThreads[0],
-                                divUp(rows, localThreads[1]) *localThreads[1],
-                                1
-                              };
+    size_t globalThreads[3] = { cols, dst.rows, 1 };
 
     std::vector<std::pair<size_t , const void *> > args;
     args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src1.data ));
@@ -1236,9 +916,9 @@ void cv::ocl::magnitude(const oclMat &src1, const oclMat &src2, oclMat &dst)
 
 static void arithmetic_phase_run(const oclMat &src1, const oclMat &src2, oclMat &dst, String kernelName, const char **kernelString)
 {
-    if(!src1.clCxt->supportsFeature(Context::CL_DOUBLE) && src1.type() == CV_64F)
+    if (!src1.clCxt->supportsFeature(Context::CL_DOUBLE) && src1.type() == CV_64F)
     {
-        CV_Error(Error::GpuNotSupported, "Selected device don't support double\r\n");
+        CV_Error(Error::GpuNotSupported, "Selected device doesn't support double\r\n");
         return;
     }
 
@@ -1252,13 +932,9 @@ static void arithmetic_phase_run(const oclMat &src1, const oclMat &src2, oclMat
     size_t vector_length = 1;
     int offset_cols = ((dst.offset % dst.step) / dst.elemSize1()) & (vector_length - 1);
     int cols = divUp(dst.cols * channels + offset_cols, vector_length);
-    int rows = dst.rows;
 
     size_t localThreads[3]  = { 64, 4, 1 };
-    size_t globalThreads[3] = { divUp(cols, localThreads[0]) *localThreads[0],
-                                divUp(rows, localThreads[1]) *localThreads[1],
-                                1
-                              };
+    size_t globalThreads[3] = { cols, dst.rows, 1 };
 
     int dst_step1 = dst.cols * dst.elemSize();
     std::vector<std::pair<size_t , const void *> > args;
@@ -1277,32 +953,26 @@ static void arithmetic_phase_run(const oclMat &src1, const oclMat &src2, oclMat
 
     openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, -1, depth);
 }
+
 void cv::ocl::phase(const oclMat &x, const oclMat &y, oclMat &Angle , bool angleInDegrees)
 {
     CV_Assert(x.type() == y.type() && x.size() == y.size() && (x.depth() == CV_32F || x.depth() == CV_64F));
     Angle.create(x.size(), x.type());
     String kernelName = angleInDegrees ? "arithm_phase_indegrees" : "arithm_phase_inradians";
-    if(angleInDegrees)
-    {
-        arithmetic_phase_run(x, y, Angle, kernelName, &arithm_phase);
-        //std::cout<<"1"<<std::endl;
-    }
-    else
-    {
-        arithmetic_phase_run(x, y, Angle, kernelName, &arithm_phase);
-        //std::cout<<"2"<<std::endl;
-    }
+    // kernelName already selects degrees vs. radians, so a single call covers both cases
+    arithmetic_phase_run(x, y, Angle, kernelName, &arithm_phase);
 }
 
 //////////////////////////////////////////////////////////////////////////////
 ////////////////////////////////// cartToPolar ///////////////////////////////
 //////////////////////////////////////////////////////////////////////////////
+
 static void arithmetic_cartToPolar_run(const oclMat &src1, const oclMat &src2, oclMat &dst_mag, oclMat &dst_cart,
                                 String kernelName, bool angleInDegrees)
 {
-    if(!src1.clCxt->supportsFeature(Context::CL_DOUBLE) && src1.type() == CV_64F)
+    if (!src1.clCxt->supportsFeature(Context::CL_DOUBLE) && src1.type() == CV_64F)
     {
-        CV_Error(Error::GpuNotSupported, "Selected device don't support double\r\n");
+        CV_Error(Error::GpuNotSupported, "Selected device doesn't support double\r\n");
         return;
     }
 
@@ -1311,13 +983,9 @@ static void arithmetic_cartToPolar_run(const oclMat &src1, const oclMat &src2, o
     int depth = src1.depth();
 
     int cols = src1.cols * channels;
-    int rows = src1.rows;
 
     size_t localThreads[3]  = { 64, 4, 1 };
-    size_t globalThreads[3] = { divUp(cols, localThreads[0]) *localThreads[0],
-                                divUp(rows, localThreads[1]) *localThreads[1],
-                                1
-                              };
+    size_t globalThreads[3] = { cols, src1.rows, 1 };
 
     int tmp = angleInDegrees ? 1 : 0;
     std::vector<std::pair<size_t , const void *> > args;
@@ -1333,12 +1001,13 @@ static void arithmetic_cartToPolar_run(const oclMat &src1, const oclMat &src2, o
     args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst_cart.data ));
     args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst_cart.step ));
     args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst_cart.offset ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&rows ));
+    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1.rows ));
     args.push_back( std::make_pair( sizeof(cl_int), (void *)&cols ));
     args.push_back( std::make_pair( sizeof(cl_int), (void *)&tmp ));
 
     openCLExecuteKernel(clCxt, &arithm_cartToPolar, kernelName, globalThreads, localThreads, args, -1, depth);
 }
+
 void cv::ocl::cartToPolar(const oclMat &x, const oclMat &y, oclMat &mag, oclMat &angle, bool angleInDegrees)
 {
     CV_Assert(x.type() == y.type() && x.size() == y.size() && (x.depth() == CV_32F || x.depth() == CV_64F));
@@ -1352,12 +1021,13 @@ void cv::ocl::cartToPolar(const oclMat &x, const oclMat &y, oclMat &mag, oclMat
 //////////////////////////////////////////////////////////////////////////////
 ////////////////////////////////// polarToCart ///////////////////////////////
 //////////////////////////////////////////////////////////////////////////////
+
 static void arithmetic_ptc_run(const oclMat &src1, const oclMat &src2, oclMat &dst1, oclMat &dst2, bool angleInDegrees,
                         String kernelName)
 {
-    if(!src1.clCxt->supportsFeature(Context::CL_DOUBLE) && src1.type() == CV_64F)
+    if (!src1.clCxt->supportsFeature(Context::CL_DOUBLE) && src1.type() == CV_64F)
     {
-        CV_Error(Error::GpuNotSupported, "Selected device don't support double\r\n");
+        CV_Error(Error::GpuNotSupported, "Selected device doesn't support double\r\n");
         return;
     }
 
@@ -1369,14 +1039,11 @@ static void arithmetic_ptc_run(const oclMat &src1, const oclMat &src2, oclMat &d
     int rows = src2.rows;
 
     size_t localThreads[3]  = { 64, 4, 1 };
-    size_t globalThreads[3] = { divUp(cols, localThreads[0]) *localThreads[0],
-                                divUp(rows, localThreads[1]) *localThreads[1],
-                                1
-                              };
+    size_t globalThreads[3] = { cols, rows, 1 };
 
     int tmp = angleInDegrees ? 1 : 0;
     std::vector<std::pair<size_t , const void *> > args;
-    if(src1.data)
+    if (src1.data)
     {
         args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src1.data ));
         args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1.step ));
@@ -1405,7 +1072,7 @@ void cv::ocl::polarToCart(const oclMat &magnitude, const oclMat &angle, oclMat &
     x.create(angle.size(), angle.type());
     y.create(angle.size(), angle.type());
 
-    if( magnitude.data )
+    if ( magnitude.data )
     {
         CV_Assert( magnitude.size() == angle.size() && magnitude.type() == angle.type() );
         arithmetic_ptc_run(magnitude, angle, x, y, angleInDegrees, "arithm_polarToCart_mag");
@@ -1417,6 +1084,7 @@ void cv::ocl::polarToCart(const oclMat &magnitude, const oclMat &angle, oclMat &
 //////////////////////////////////////////////////////////////////////////////
 /////////////////////////////////// minMaxLoc ////////////////////////////////
 //////////////////////////////////////////////////////////////////////////////
+
 static void arithmetic_minMaxLoc_run(const oclMat &src, cl_mem &dst, int vlen , int groupnum)
 {
     std::vector<std::pair<size_t , const void *> > args;
@@ -1446,7 +1114,7 @@ static void arithmetic_minMaxLoc_mask_run(const oclMat &src, const oclMat &mask,
     std::vector<std::pair<size_t , const void *> > args;
     size_t gt[3] = {groupnum * 256, 1, 1}, lt[3] = {256, 1, 1};
     char build_options[50];
-    if(src.oclchannels() == 1)
+    if (src.oclchannels() == 1)
     {
         int cols = (src.cols - 1) / vlen + 1;
         int invalid_cols = src.step / (vlen * src.elemSize1()) - cols;
@@ -1466,12 +1134,12 @@ static void arithmetic_minMaxLoc_mask_run(const oclMat &src, const oclMat &mask,
         args.push_back( std::make_pair( sizeof(cl_int) , (void *)&moffset ));
         args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&mask.data ));
         args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst ));
-        //    printf("elemnum:%d,cols:%d,invalid_cols:%d,offset:%d,minvalid_cols:%d,moffset:%d,repeat_e:%d\r\n",
-        //           elemnum,cols,invalid_cols,offset,minvalid_cols,moffset,repeat_me);
+
         openCLExecuteKernel(src.clCxt, &arithm_minMaxLoc_mask, "arithm_op_minMaxLoc_mask", gt, lt, args, -1, -1, build_options);
     }
 }
-template<typename T>
+
+template <typename T>
 void arithmetic_minMaxLoc(const oclMat &src, double *minVal, double *maxVal,
                           Point *minLoc, Point *maxLoc, const oclMat &mask)
 {
@@ -1483,23 +1151,23 @@ void arithmetic_minMaxLoc(const oclMat &src, double *minVal, double *maxVal,
     Context *clCxt = src.clCxt;
     cl_mem dstBuffer = openCLCreateBuffer(clCxt, CL_MEM_WRITE_ONLY, dbsize);
     *minVal = std::numeric_limits<double>::max() , *maxVal = -std::numeric_limits<double>::max();
+
     if (mask.empty())
-    {
         arithmetic_minMaxLoc_run(src, dstBuffer, vlen, groupnum);
-    }
     else
-    {
         arithmetic_minMaxLoc_mask_run(src, mask, dstBuffer, vlen, groupnum);
-    }
-    T *p = new T[groupnum * vlen * 4];
+
+    AutoBuffer<T> _buf(groupnum * vlen * 4);
+    T *p = (T*)_buf;
     memset(p, 0, dbsize);
+
     openCLReadBuffer(clCxt, dstBuffer, (void *)p, dbsize);
-    for(int i = 0; i < vlen * (int)groupnum; i++)
+    for (int i = 0; i < vlen * (int)groupnum; i++)
     {
         *minVal = (*minVal < p[i] || p[i + 2 * vlen * groupnum] == -1) ? *minVal : p[i];
         minloc = (*minVal < p[i] || p[i + 2 * vlen * groupnum] == -1) ? minloc : cvRound(p[i + 2 * vlen * groupnum]);
     }
-    for(int i = vlen * (int)groupnum; i < 2 * vlen * (int)groupnum; i++)
+    for (int i = vlen * (int)groupnum; i < 2 * vlen * (int)groupnum; i++)
     {
         *maxVal = (*maxVal > p[i] || p[i + 2 * vlen * groupnum] == -1) ? *maxVal : p[i];
         maxloc = (*maxVal > p[i] || p[i + 2 * vlen * groupnum] == -1) ? maxloc : cvRound(p[i + 2 * vlen * groupnum]);
@@ -1508,9 +1176,9 @@ void arithmetic_minMaxLoc(const oclMat &src, double *minVal, double *maxVal,
     int pre_rows = src.offset / src.step;
     int pre_cols = (src.offset % src.step) / src.elemSize1();
     int wholecols = src.step / src.elemSize1();
-    if( minLoc )
+    if ( minLoc )
     {
-        if( minloc >= 0 )
+        if ( minloc >= 0 )
         {
             minLoc->y = minloc / wholecols - pre_rows;
             minLoc->x = minloc % wholecols - pre_cols;
@@ -1518,9 +1186,9 @@ void arithmetic_minMaxLoc(const oclMat &src, double *minVal, double *maxVal,
         else
             minLoc->x = minLoc->y = -1;
     }
-    if( maxLoc )
+    if ( maxLoc )
     {
-        if( maxloc >= 0 )
+        if ( maxloc >= 0 )
         {
             maxLoc->y = maxloc / wholecols - pre_rows;
             maxLoc->x = maxloc % wholecols - pre_cols;
@@ -1528,19 +1196,22 @@ void arithmetic_minMaxLoc(const oclMat &src, double *minVal, double *maxVal,
         else
             maxLoc->x = maxLoc->y = -1;
     }
-    delete[] p;
+
     openCLSafeCall(clReleaseMemObject(dstBuffer));
 }
 
 typedef void (*minMaxLocFunc)(const oclMat &src, double *minVal, double *maxVal,
                               Point *minLoc, Point *maxLoc, const oclMat &mask);
+
 void cv::ocl::minMaxLoc(const oclMat &src, double *minVal, double *maxVal,
                         Point *minLoc, Point *maxLoc, const oclMat &mask)
 {
-    if(!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F)
+    if (!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F)
     {
-        CV_Error(Error::GpuNotSupported, "select device don't support double");
+        CV_Error(Error::GpuNotSupported, "Selected device doesn't support double");
+        return;
     }
+
     static minMaxLocFunc functab[2] =
     {
         arithmetic_minMaxLoc<float>,
@@ -1555,6 +1226,7 @@ void cv::ocl::minMaxLoc(const oclMat &src, double *minVal, double *maxVal,
 //////////////////////////////////////////////////////////////////////////////
 ///////////////////////////// countNonZero ///////////////////////////////////
 //////////////////////////////////////////////////////////////////////////////
+
 static void arithmetic_countNonZero_run(const oclMat &src, cl_mem &dst, int vlen , int groupnum, String kernelName)
 {
     std::vector<std::pair<size_t , const void *> > args;
@@ -1584,27 +1256,25 @@ static void arithmetic_countNonZero_run(const oclMat &src, cl_mem &dst, int vlen
 int cv::ocl::countNonZero(const oclMat &src)
 {
     size_t groupnum = src.clCxt->computeUnits();
-    if(!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F)
+    if (!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F)
     {
-        CV_Error(Error::GpuNotSupported, "select device don't support double");
+        CV_Error(Error::GpuNotSupported, "Selected device doesn't support double");
     }
     CV_Assert(groupnum != 0);
-    groupnum = groupnum * 2;
     int vlen = 8 , dbsize = groupnum * vlen;
-    //cl_ulong start, end;
     Context *clCxt = src.clCxt;
     String kernelName = "arithm_op_nonzero";
-    int *p = new int[dbsize], nonzero = 0;
+
+    AutoBuffer<int> _buf(dbsize);
+    int *p = (int*)_buf, nonzero = 0;
     cl_mem dstBuffer = openCLCreateBuffer(clCxt, CL_MEM_WRITE_ONLY, dbsize * sizeof(int));
     arithmetic_countNonZero_run(src, dstBuffer, vlen, groupnum, kernelName);
 
     memset(p, 0, dbsize * sizeof(int));
     openCLReadBuffer(clCxt, dstBuffer, (void *)p, dbsize * sizeof(int));
-    for(int i = 0; i < dbsize; i++)
-    {
+    for (int i = 0; i < dbsize; i++)
         nonzero += p[i];
-    }
-    delete[] p;
+
     openCLSafeCall(clReleaseMemObject(dstBuffer));
     return nonzero;
 }
@@ -1612,6 +1282,7 @@ int cv::ocl::countNonZero(const oclMat &src)
 //////////////////////////////////////////////////////////////////////////////
 ////////////////////////////////bitwise_op////////////////////////////////////
 //////////////////////////////////////////////////////////////////////////////
+
 static void bitwise_run(const oclMat &src1, oclMat &dst, String kernelName, const char **kernelString)
 {
     dst.create(src1.size(), src1.type());
@@ -1632,10 +1303,7 @@ static void bitwise_run(const oclMat &src1, oclMat &dst, String kernelName, cons
     int cols = divUp(dst.cols * channels + offset_cols, vector_length);
 
     size_t localThreads[3]  = { 64, 4, 1 };
-    size_t globalThreads[3] = { divUp(cols, localThreads[0]) *localThreads[0],
-                                divUp(dst.rows, localThreads[1]) *localThreads[1],
-                                1
-                              };
+    size_t globalThreads[3] = { cols, dst.rows, 1 };
 
     int dst_step1 = dst.cols * dst.elemSize();
     std::vector<std::pair<size_t , const void *> > args;
@@ -1652,236 +1320,90 @@ static void bitwise_run(const oclMat &src1, oclMat &dst, String kernelName, cons
     openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, -1, depth);
 }
 
+enum { AND = 0, OR, XOR }; // operation codes for bitwise_binary_run; each one indexes its operationMap table
 
-template<typename T>
-void bitwise_run(const oclMat &src1, const oclMat &src2, oclMat &dst, String kernelName,
- const char **kernelString, void *_scalar, const char* _opt = NULL)
+static void bitwise_binary_run(const oclMat &src1, const oclMat &src2, const Scalar& src3, const oclMat &mask,
+                               oclMat &dst, int operationType) // empty src2 => scalar src3 is the second operand; mask may be empty
 {
-    dst.create(src1.size(), src1.type());
-    CV_Assert(src1.cols == src2.cols && src2.cols == dst.cols &&
-              src1.rows == src2.rows && src2.rows == dst.rows);
-
-    CV_Assert(src1.type() == src2.type() && src1.type() == dst.type());
-
     Context  *clCxt = src1.clCxt;
-    int channels = dst.oclchannels();
-    int depth = dst.depth();
+    if (!clCxt->supportsFeature(Context::CL_DOUBLE) && src1.depth() == CV_64F)
+    {
+        std::cout << "Selected device does not support double" << std::endl;
+        return; // NOTE(review): returns silently with dst untouched; transpose_run/addWeighted raise CV_Error in this case
+    }
 
-    int vector_lengths[4][7] = {{4, 4, 4, 4, 1, 1, 1},
-        {4, 4, 4, 4, 1, 1, 1},
-        {4, 4, 4, 4, 1, 1, 1},
-        {4, 4, 4, 4, 1, 1, 1}
-    };
+    CV_Assert(operationType >= AND && operationType <= XOR);
+    CV_Assert(src2.empty() || (!src2.empty() && src1.type() == src2.type() && src1.size() == src2.size()));
+    CV_Assert(mask.empty() || (!mask.empty() && mask.type() == CV_8UC1 && mask.size() == src1.size()));
 
-    size_t vector_length = vector_lengths[channels - 1][depth];
-    int offset_cols = (dst.offset / dst.elemSize1()) & (vector_length - 1);
-    int cols = divUp(dst.cols * channels + offset_cols, vector_length);
+    dst.create(src1.size(), src1.type());
 
-    size_t localThreads[3]  = { 64, 4, 1 };
-    size_t globalThreads[3] = { divUp(cols, localThreads[0]) *localThreads[0],
-                                divUp(dst.rows, localThreads[1]) *localThreads[1],
-                                1
-                              };
+    int elemSize = dst.elemSize();
+    int cols1 = dst.cols * elemSize; // row width: columns times element size
+    oclMat m; // 1x1 holder for the scalar operand; function scope keeps &m.data valid until the kernel launch
+
+    const char operationMap[] = { '&', '|', '^' };
+    std::string kernelName("arithm_bitwise_binary");
+    std::string buildOptions = format("-D Operation=%c", operationMap[operationType]); // operator char becomes the Operation macro
+
+    size_t localThreads[3]  = { 16, 16, 1 };
+    size_t globalThreads[3] = { (size_t)cols1, (size_t)dst.rows, 1 }; // explicit casts: int -> size_t narrows inside braces
 
-    int dst_step1 = dst.cols * dst.elemSize();
     std::vector<std::pair<size_t , const void *> > args;
     args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src1.data ));
     args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1.step ));
     args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1.offset ));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src2.data ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src2.step ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src2.offset ));
+
+    if (src2.empty())
+    {
+        m.create(1, 1, dst.type()); // scalar path: upload src3 as a single element of dst's type
+        m.setTo(src3);
+
+        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&m.data ));
+        args.push_back( std::make_pair( sizeof(cl_int), (void *)&elemSize ) );
+
+        kernelName += "_scalar";
+    }
+    else
+    {
+        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src2.data ));
+        args.push_back( std::make_pair( sizeof(cl_int), (void *)&src2.step ));
+        args.push_back( std::make_pair( sizeof(cl_int), (void *)&src2.offset ));
+    }
+
+    if (!mask.empty())
+    {
+        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&mask.data ));
+        args.push_back( std::make_pair( sizeof(cl_int), (void *)&mask.step ));
+        args.push_back( std::make_pair( sizeof(cl_int), (void *)&mask.offset ));
+
+        if (!src2.empty()) // the scalar branch above has already pushed elemSize
+            args.push_back( std::make_pair( sizeof(cl_int), (void *)&elemSize ));
+
+        kernelName += "_mask";
+    }
+
     args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst.data ));
     args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.step ));
     args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.offset ));
+
+    args.push_back( std::make_pair( sizeof(cl_int), (void *)&cols1 ));
     args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1.rows ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&cols ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst_step1 ));
 
-    T scalar;
-    if(_scalar != NULL)
-    {
-        double scalar1 = *((double *)_scalar);
-        scalar = (T)scalar1;
-        args.push_back( std::make_pair( sizeof(T), (void *)&scalar ));
-    }
-
-    openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, -1, depth, _opt);
-}
-static void bitwise_run(const oclMat &src1, const oclMat &src2, oclMat &dst,
- String kernelName, const char **kernelString, const char* _opt = NULL)
-{
-    bitwise_run<char>(src1, src2, dst, kernelName, kernelString, (void *)NULL, _opt);
-}
-static void bitwise_run(const oclMat &src1, const oclMat &src2, oclMat &dst,
- const oclMat &mask, String kernelName, const char **kernelString, const char* _opt = NULL)
-{
-    dst.create(src1.size(), src1.type());
-    CV_Assert(src1.cols == src2.cols && src2.cols == dst.cols &&
-              src1.rows == src2.rows && src2.rows == dst.rows &&
-              src1.rows == mask.rows && src1.cols == mask.cols);
-
-    CV_Assert(src1.type() == src2.type() && src1.type() == dst.type());
-    CV_Assert(mask.type() == CV_8U);
-
-    Context  *clCxt = src1.clCxt;
-    int channels = dst.oclchannels();
-    int depth = dst.depth();
-
-    int vector_lengths[4][7] = {{4, 4, 2, 2, 1, 1, 1},
-        {2, 2, 1, 1, 1, 1, 1},
-        {4, 4, 2, 2 , 1, 1, 1},
-        {1, 1, 1, 1, 1, 1, 1}
-    };
-
-    size_t vector_length = vector_lengths[channels - 1][depth];
-    int offset_cols = ((dst.offset % dst.step) / dst.elemSize()) & (vector_length - 1);
-    int cols = divUp(dst.cols + offset_cols, vector_length);
-
-    size_t localThreads[3]  = { 64, 4, 1 };
-    size_t globalThreads[3] = { divUp(cols, localThreads[0]) *localThreads[0],
-                                divUp(dst.rows, localThreads[1]) *localThreads[1],
-                                1
-                              };
-
-    int dst_step1 = dst.cols * dst.elemSize();
-    std::vector<std::pair<size_t , const void *> > args;
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src1.data ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1.step ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1.offset ));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src2.data ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src2.step ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src2.offset ));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&mask.data ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&mask.step ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&mask.offset ));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst.data ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.step ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.offset ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1.rows ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&cols ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst_step1 ));
-
-    openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, channels, depth, _opt);
-}
-
-
-template <typename WT , typename CL_WT>
-void bitwise_scalar_run(const oclMat &src1, const Scalar &src2, oclMat &dst,
- const oclMat &mask, String kernelName, const char **kernelString, int isMatSubScalar, const char* opt = NULL)
-{
-    dst.create(src1.size(), src1.type());
-
-    CV_Assert(src1.cols == dst.cols && src1.rows == dst.rows &&
-              src1.type() == dst.type());
-
-
-    if(mask.data)
-    {
-        CV_Assert(mask.type() == CV_8U && src1.rows == mask.rows && src1.cols == mask.cols);
-    }
-
-    Context  *clCxt = src1.clCxt;
-    int channels = dst.oclchannels();
-    int depth = dst.depth();
-
-    WT s[4] = { saturate_cast<WT>(src2.val[0]), saturate_cast<WT>(src2.val[1]),
-                saturate_cast<WT>(src2.val[2]), saturate_cast<WT>(src2.val[3])
-              };
-
-    int vector_lengths[4][7] = {{4, 4, 2, 2, 1, 1, 1},
-        {2, 2, 1, 1, 1, 1, 1},
-        {4, 4, 2, 2 , 1, 1, 1},
-        {1, 1, 1, 1, 1, 1, 1}
-    };
-
-    size_t vector_length = vector_lengths[channels - 1][depth];
-    int offset_cols = ((dst.offset % dst.step) / dst.elemSize()) & (vector_length - 1);
-    int cols = divUp(dst.cols + offset_cols, vector_length);
-
-    size_t localThreads[3]  = { 64, 4, 1 };
-    size_t globalThreads[3] = { divUp(cols, localThreads[0]) *localThreads[0],
-                                divUp(dst.rows, localThreads[1]) *localThreads[1],
-                                1
-                              };
-
-    int dst_step1 = dst.cols * dst.elemSize();
-    std::vector<std::pair<size_t , const void *> > args;
-    args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src1.data ));
-    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src1.step ));
-    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src1.offset));
-    args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst.data ));
-    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.step ));
-    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.offset));
-
-    if(mask.data)
-    {
-        args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&mask.data ));
-        args.push_back( std::make_pair( sizeof(cl_int) , (void *)&mask.step ));
-        args.push_back( std::make_pair( sizeof(cl_int) , (void *)&mask.offset));
-    }
-    args.push_back( std::make_pair( sizeof(CL_WT) , (void *)&s ));
-    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src1.rows ));
-    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&cols ));
-    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst_step1 ));
-    if(isMatSubScalar != 0)
-    {
-        isMatSubScalar = isMatSubScalar > 0 ? 1 : 0;
-        args.push_back( std::make_pair( sizeof(cl_int) , (void *)&isMatSubScalar));
-    }
-
-    openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, channels, depth, opt);
-}
-
-
-typedef void (*BitwiseFuncS)(const oclMat &src1, const Scalar &src2, oclMat &dst,
- const oclMat &mask, String kernelName, const char **kernelString, int isMatSubScalar, const char* opt);
-
-
-static void bitwise_scalar(const oclMat &src1, const Scalar &src2, oclMat &dst,
- const oclMat &mask, String kernelName, const char **kernelString, int isMatSubScalar, const char* opt)
-{
-    static BitwiseFuncS tab[8] =
-    {
-#if 0
-        bitwise_scalar_run<unsigned char>,
-        bitwise_scalar_run<char>,
-        bitwise_scalar_run<unsigned short>,
-        bitwise_scalar_run<short>,
-        bitwise_scalar_run<int>,
-        bitwise_scalar_run<float>,
-        bitwise_scalar_run<double>,
-        0
-#else
-
-        bitwise_scalar_run<unsigned char, cl_uchar4>,
-        bitwise_scalar_run<char, cl_char4>,
-        bitwise_scalar_run<unsigned short, cl_ushort4>,
-        bitwise_scalar_run<short, cl_short4>,
-        bitwise_scalar_run<int, cl_int4>,
-        bitwise_scalar_run<float, cl_float4>,
-        bitwise_scalar_run<double, cl_double4>,
-        0
-#endif
-    };
-    BitwiseFuncS func = tab[src1.depth()];
-    if(func == 0)
-        cv::error(Error::StsBadArg, "Unsupported arithmetic operation", "", __FILE__, __LINE__);
-    func(src1, src2, dst, mask, kernelName, kernelString, isMatSubScalar, opt);
-}
-static void bitwise_scalar(const oclMat &src1, const Scalar &src2, oclMat &dst,
- const oclMat &mask, String kernelName, const char **kernelString, const char * opt = NULL)
-{
-    bitwise_scalar(src1, src2, dst, mask, kernelName, kernelString, 0, opt);
+    openCLExecuteKernel(clCxt, mask.empty() ? (!src2.empty() ? &arithm_bitwise_binary : &arithm_bitwise_binary_scalar) :
+                                              (!src2.empty() ? &arithm_bitwise_binary_mask : &arithm_bitwise_binary_scalar_mask),
+                        kernelName, globalThreads, localThreads,
+                        args, -1, -1, buildOptions.c_str());
 }
 
 void cv::ocl::bitwise_not(const oclMat &src, oclMat &dst)
 {
-    if(!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.type() == CV_64F)
+    if (!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.type() == CV_64F)
     {
-        std::cout << "Selected device do not support double" << std::endl;
+        std::cout << "Selected device does not support double" << std::endl;
         return;
     }
+
     dst.create(src.size(), src.type());
     String kernelName =  "arithm_bitwise_not";
     bitwise_run(src, dst, kernelName, &arithm_bitwise_not);
@@ -1889,103 +1411,32 @@ void cv::ocl::bitwise_not(const oclMat &src, oclMat &dst)
 
 void cv::ocl::bitwise_or(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask)
 {
-    // dst.create(src1.size(),src1.type());
-    if(!src1.clCxt->supportsFeature(Context::CL_DOUBLE) && src1.type() == CV_64F)
-    {
-        std::cout << "Selected device do not support double" << std::endl;
-        return;
-    }
-
-    String kernelName = mask.empty() ? "arithm_bitwise_binary" : "arithm_bitwise_binary_with_mask";
-    static const char opt [] = "-D OP_BINARY=|";
-    if (mask.empty())
-        bitwise_run(src1, src2, dst, kernelName, &arithm_bitwise_binary, opt);
-    else
-        bitwise_run(src1, src2, dst, mask, kernelName, &arithm_bitwise_binary_mask, opt);
+    bitwise_binary_run(src1, src2, Scalar(), mask, dst, OR); // matrix operand; Scalar() is a placeholder read only if src2 is empty
 }
 
-
 void cv::ocl::bitwise_or(const oclMat &src1, const Scalar &src2, oclMat &dst, const oclMat &mask)
 {
-    if(!src1.clCxt->supportsFeature(Context::CL_DOUBLE) && src1.type() == CV_64F)
-    {
-        std::cout << "Selected device do not support double" << std::endl;
-        return;
-    }
-    static const char opt [] = "-D OP_BINARY=|";
-    String kernelName = mask.data ? "arithm_s_bitwise_binary_with_mask" : "arithm_s_bitwise_binary";
-    if (mask.data)
-        bitwise_scalar( src1, src2, dst, mask, kernelName, &arithm_bitwise_binary_scalar_mask, opt);
-    else
-        bitwise_scalar( src1, src2, dst, mask, kernelName, &arithm_bitwise_binary_scalar, opt);
+    bitwise_binary_run(src1, oclMat(), src2, mask, dst, OR); // empty oclMat() makes bitwise_binary_run take its scalar branch
 }
 
 void cv::ocl::bitwise_and(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask)
 {
-    //    dst.create(src1.size(),src1.type());
-    if(!src1.clCxt->supportsFeature(Context::CL_DOUBLE) && src1.type() == CV_64F)
-    {
-        std::cout << "Selected device do not support double" << std::endl;
-        return;
-    }
-    oclMat emptyMat;
-
-    String kernelName = mask.empty() ? "arithm_bitwise_binary" : "arithm_bitwise_binary_with_mask";
-
-    static const char opt [] = "-D OP_BINARY=&";
-    if (mask.empty())
-        bitwise_run(src1, src2, dst, kernelName, &arithm_bitwise_binary, opt);
-    else
-        bitwise_run(src1, src2, dst, mask, kernelName, &arithm_bitwise_binary_mask, opt);
+    bitwise_binary_run(src1, src2, Scalar(), mask, dst, AND); // matrix operand; Scalar() is a placeholder read only if src2 is empty
 }
 
 void cv::ocl::bitwise_and(const oclMat &src1, const Scalar &src2, oclMat &dst, const oclMat &mask)
 {
-    if(!src1.clCxt->supportsFeature(Context::CL_DOUBLE) && src1.type() == CV_64F)
-    {
-        std::cout << "Selected device do not support double" << std::endl;
-        return;
-    }
-    static const char opt [] = "-D OP_BINARY=&";
-    String kernelName = mask.data ? "arithm_s_bitwise_binary_with_mask" : "arithm_s_bitwise_binary";
-    if (mask.data)
-        bitwise_scalar(src1, src2, dst, mask, kernelName, &arithm_bitwise_binary_scalar_mask, opt);
-    else
-        bitwise_scalar(src1, src2, dst, mask, kernelName, &arithm_bitwise_binary_scalar, opt);
+    bitwise_binary_run(src1, oclMat(), src2, mask, dst, AND); // empty oclMat() makes bitwise_binary_run take its scalar branch
 }
 
 void cv::ocl::bitwise_xor(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask)
 {
-    if(!src1.clCxt->supportsFeature(Context::CL_DOUBLE) && src1.type() == CV_64F)
-    {
-        std::cout << "Selected device do not support double" << std::endl;
-        return;
-    }
-    String kernelName = mask.empty() ? "arithm_bitwise_binary" : "arithm_bitwise_binary_with_mask";
-
-    static const char opt [] = "-D OP_BINARY=^";
-
-    if (mask.empty())
-        bitwise_run(src1, src2, dst, kernelName, &arithm_bitwise_binary, opt);
-    else
-        bitwise_run(src1, src2, dst, mask, kernelName, &arithm_bitwise_binary_mask, opt);
+    bitwise_binary_run(src1, src2, Scalar(), mask, dst, XOR); // matrix operand; Scalar() is a placeholder read only if src2 is empty
 }
 
-
 void cv::ocl::bitwise_xor(const oclMat &src1, const Scalar &src2, oclMat &dst, const oclMat &mask)
 {
-
-    if(!src1.clCxt->supportsFeature(Context::CL_DOUBLE) && src1.type() == CV_64F)
-    {
-        std::cout << "Selected device do not support double" << std::endl;
-        return;
-    }
-    String kernelName = mask.data ? "arithm_s_bitwise_binary_with_mask" : "arithm_s_bitwise_binary";
-    static const char opt [] = "-D OP_BINARY=^";
-    if (mask.data)
-        bitwise_scalar( src1, src2, dst, mask, kernelName, &arithm_bitwise_binary_scalar_mask, opt);
-    else
-        bitwise_scalar( src1, src2, dst, mask, kernelName, &arithm_bitwise_binary_scalar, opt);
+    bitwise_binary_run(src1, oclMat(), src2, mask, dst, XOR); // empty oclMat() makes bitwise_binary_run take its scalar branch
 }
 
 oclMat cv::ocl::operator ~ (const oclMat &src)
@@ -2069,60 +1520,51 @@ oclMatExpr::operator oclMat() const
 //////////////////////////////////////////////////////////////////////////////
 /////////////////////////////// transpose ////////////////////////////////////
 //////////////////////////////////////////////////////////////////////////////
+
 #define TILE_DIM      (32)
 #define BLOCK_ROWS    (256/TILE_DIM)
-static void transpose_run(const oclMat &src, oclMat &dst, String kernelName)
+// Launches kernel `kernelName` from arithm_transpose on src/dst; `inplace` only switches the global work size.
+static void transpose_run(const oclMat &src, oclMat &dst, String kernelName, bool inplace = false)
 {
-    if(!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.type() == CV_64F)
+    Context  *clCxt = src.clCxt;
+    if (!clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F)
     {
-        CV_Error(Error::GpuNotSupported, "Selected device don't support double\r\n");
+        CV_Error(Error::GpuNotSupported, "Selected device doesn't support double\r\n");
         return;
     }
 
-    CV_Assert(src.cols == dst.rows && src.rows == dst.cols);
-
-    Context  *clCxt = src.clCxt;
-    int channels = src.oclchannels();
-    int depth = src.depth();
-
-    int vector_lengths[4][7] = {{1, 0, 0, 0, 1, 1, 0},
-        {0, 0, 1, 1, 0, 0, 0},
-        {0, 0, 0, 0 , 0, 0, 0},
-        {1, 1, 0, 0, 0, 0, 0}
-    };
-
-    size_t vector_length = vector_lengths[channels - 1][depth];
-    int offset_cols = ((dst.offset % dst.step) / dst.elemSize()) & (vector_length - 1);
-    int cols = divUp(src.cols + offset_cols, vector_length);
+    const char * const typeMap[] = { "uchar", "char", "ushort", "short", "int", "float", "double" }; // indexed by src.depth()
+    const char channelsString[] = { ' ', ' ', '2', '4', '4' }; // indexed by channel count; 3 maps to '4' (presumably 4-channel storage - confirm)
+    std::string buildOptions = format("-D T=%s%c", typeMap[src.depth()],
+                                      channelsString[src.channels()]);
 
     size_t localThreads[3]  = { TILE_DIM, BLOCK_ROWS, 1 };
-    size_t globalThreads[3] = { divUp(cols, TILE_DIM) *localThreads[0],
-                                divUp(src.rows, TILE_DIM) *localThreads[1],
-                                1
-                              };
+    size_t globalThreads[3] = { (size_t)src.cols, (size_t)(inplace ? src.rows : divUp(src.rows, TILE_DIM) * BLOCK_ROWS), 1 };
+
+    int srcstep1 = src.step / src.elemSize(), dststep1 = dst.step / dst.elemSize(); // steps in whole elements (divided by elemSize)
+    int srcoffset1 = src.offset / src.elemSize(), dstoffset1 = dst.offset / dst.elemSize(); // offsets in whole elements
 
     std::vector<std::pair<size_t , const void *> > args;
     args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src.data ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.step ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.offset ));
     args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst.data ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.step ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.offset ));
+    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.cols ));
     args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.rows ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&cols ));
+    args.push_back( std::make_pair( sizeof(cl_int), (void *)&srcstep1 ));
+    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dststep1 ));
+    args.push_back( std::make_pair( sizeof(cl_int), (void *)&srcoffset1 ));
+    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dstoffset1 ));
 
-    openCLExecuteKernel(clCxt, &arithm_transpose, kernelName, globalThreads, localThreads, args, channels, depth);
+    openCLExecuteKernel(clCxt, &arithm_transpose, kernelName, globalThreads, localThreads,
+                        args, -1, -1, buildOptions.c_str()); // element type reaches the kernel via the -D T=... build option
 }
 
 void cv::ocl::transpose(const oclMat &src, oclMat &dst)
 {
-    CV_Assert(src.type() == CV_8UC1  || src.type() == CV_8UC3 || src.type() == CV_8UC4  || src.type() == CV_8SC3  || src.type() == CV_8SC4  ||
-              src.type() == CV_16UC2 || src.type() == CV_16SC2 || src.type() == CV_32SC1 || src.type() == CV_32FC1);
+    CV_Assert(src.depth() <= CV_64F && src.channels() <= 4);
 
-    oclMat emptyMat;
-
-    if( src.data == dst.data && dst.cols == dst.rows )
-        transpose_run( src, emptyMat, "transposeI_");
+    if ( src.data == dst.data && src.cols == src.rows && dst.offset == src.offset
+         && dst.size() == src.size())
+        transpose_run( src, dst, "transpose_inplace", true);
     else
     {
         dst.create(src.cols, src.rows, src.type());
@@ -2130,163 +1572,77 @@ void cv::ocl::transpose(const oclMat &src, oclMat &dst)
     }
 }
 
+//////////////////////////////////////////////////////////////////////////////
+////////////////////////////// addWeighted ///////////////////////////////////
+//////////////////////////////////////////////////////////////////////////////
+
 void cv::ocl::addWeighted(const oclMat &src1, double alpha, const oclMat &src2, double beta, double gama, oclMat &dst)
 {
-    dst.create(src1.size(), src1.type());
-    CV_Assert(src1.cols ==  src2.cols && src2.cols == dst.cols &&
-              src1.rows ==  src2.rows && src2.rows == dst.rows);
-    CV_Assert(src1.type() == src2.type() && src1.type() == dst.type());
-
     Context *clCxt = src1.clCxt;
+    bool hasDouble = clCxt->supportsFeature(Context::CL_DOUBLE); // also picks the WT build option and the weight argument width below
+    if (!hasDouble && src1.depth() == CV_64F)
+    {
+        CV_Error(Error::GpuNotSupported, "Selected device doesn't support double\r\n");
+        return;
+    }
+
+    CV_Assert(src1.size() ==  src2.size() && src1.type() == src2.type());
+    dst.create(src1.size(), src1.type());
+
     int channels = dst.oclchannels();
     int depth = dst.depth();
 
+    int cols1 = src1.cols * channels; // row width counted per channel (cols * channels)
+    int src1step1 = src1.step1(), src1offset1 = src1.offset / src1.elemSize1(); // offsets are divided by the per-channel element size
+    int src2step1 = src2.step1(), src2offset1 = src2.offset / src2.elemSize1();
+    int dststep1 = dst.step1(), dstoffset1 = dst.offset / dst.elemSize1();
 
-    int vector_lengths[4][7] = {{4, 0, 4, 4, 4, 4, 4},
-        {4, 0, 4, 4, 4, 4, 4},
-        {4, 0, 4, 4, 4, 4, 4},
-        {4, 0, 4, 4, 4, 4, 4}
-    };
-
-
-    size_t vector_length = vector_lengths[channels - 1][depth];
-    int offset_cols = (dst.offset / dst.elemSize1()) & (vector_length - 1);
-    int cols = divUp(dst.cols * channels + offset_cols, vector_length);
+    const char * const typeMap[] = { "uchar", "char", "ushort", "short", "int", "float", "double" }; // indexed by depth
+    std::string buildOptions = format("-D T=%s -D WT=%s -D convertToT=convert_%s%s",
+                                      typeMap[depth], hasDouble ? "double" : "float", typeMap[depth],
+                                      depth >= CV_32F ? "" : "_sat_rte"); // integer depths: saturating, round-to-nearest-even conversion
 
     size_t localThreads[3]  = { 256, 1, 1 };
-    size_t globalThreads[3] = { divUp(cols, localThreads[0]) *localThreads[0],
-                                divUp(dst.rows, localThreads[1]) *localThreads[1],
-                                1
-                              };
+    size_t globalThreads[3] = { (size_t)cols1, (size_t)dst.rows, 1};
+    // float copies of the weights, passed to the kernel when the device has no double support
+    float alpha_f = static_cast<float>(alpha),
+            beta_f = static_cast<float>(beta),
+            gama_f = static_cast<float>(gama);
 
-    int dst_step1 = dst.cols * dst.elemSize();
-    int src1_step = (int) src1.step;
-    int src2_step = (int) src2.step;
-    int dst_step  = (int) dst.step;
-    float alpha_f = alpha, beta_f = beta, gama_f = gama;
     std::vector<std::pair<size_t , const void *> > args;
     args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src1.data ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1_step ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1.offset));
+    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1step1 ));
+    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1offset1));
     args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src2.data ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src2_step ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src2.offset));
+    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src2step1 ));
+    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src2offset1));
+    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst.data ));
+    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dststep1 ));
+    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dstoffset1));
 
-    if(src1.clCxt->supportsFeature(Context::CL_DOUBLE))
-    {
-        args.push_back( std::make_pair( sizeof(cl_double), (void *)&alpha ));
-        args.push_back( std::make_pair( sizeof(cl_double), (void *)&beta ));
-        args.push_back( std::make_pair( sizeof(cl_double), (void *)&gama ));
-    }
-    else
+    if (!hasDouble)
     {
         args.push_back( std::make_pair( sizeof(cl_float), (void *)&alpha_f ));
         args.push_back( std::make_pair( sizeof(cl_float), (void *)&beta_f ));
         args.push_back( std::make_pair( sizeof(cl_float), (void *)&gama_f ));
     }
+    else
+    {
+        args.push_back( std::make_pair( sizeof(cl_double), (void *)&alpha ));
+        args.push_back( std::make_pair( sizeof(cl_double), (void *)&beta ));
+        args.push_back( std::make_pair( sizeof(cl_double), (void *)&gama ));
+    }
 
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst.data ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst_step ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.offset));
+    args.push_back( std::make_pair( sizeof(cl_int), (void *)&cols1 ));
     args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1.rows ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&cols ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst_step1 ));
 
-    openCLExecuteKernel(clCxt, &arithm_addWeighted, "addWeighted", globalThreads, localThreads, args, -1, depth);
+    openCLExecuteKernel(clCxt, &arithm_addWeighted, "addWeighted", globalThreads, localThreads,
+                        args, -1, -1, buildOptions.c_str());
 }
 
-void cv::ocl::magnitudeSqr(const oclMat &src1, const oclMat &src2, oclMat &dst)
-{
-    CV_Assert(src1.type() == src2.type() && src1.size() == src2.size() &&
-              (src1.depth() == CV_32F ));
-
-    dst.create(src1.size(), src1.type());
-
-
-    Context *clCxt = src1.clCxt;
-    int channels = dst.oclchannels();
-    int depth = dst.depth();
-
-
-    int vector_lengths[4][7] = {{4, 0, 4, 4, 4, 4, 4},
-        {4, 0, 4, 4, 4, 4, 4},
-        {4, 0, 4, 4, 4, 4, 4},
-        {4, 0, 4, 4, 4, 4, 4}
-    };
-
-
-    size_t vector_length = vector_lengths[channels - 1][depth];
-    int offset_cols = (dst.offset / dst.elemSize1()) & (vector_length - 1);
-    int cols = divUp(dst.cols * channels + offset_cols, vector_length);
-
-    size_t localThreads[3]  = { 256, 1, 1 };
-    size_t globalThreads[3] = { divUp(cols, localThreads[0]) *localThreads[0],
-                                divUp(dst.rows, localThreads[1]) *localThreads[1],
-                                1
-                              };
-
-    int dst_step1 = dst.cols * dst.elemSize();
-    std::vector<std::pair<size_t , const void *> > args;
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src1.data ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1.step ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1.offset));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src2.data ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src2.step ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src2.offset));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst.data ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.step ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.offset));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1.rows ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&cols ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst_step1 ));
-
-    openCLExecuteKernel(clCxt, &arithm_magnitudeSqr, "magnitudeSqr", globalThreads, localThreads, args, 1, depth);
-}
-
-void cv::ocl::magnitudeSqr(const oclMat &src1, oclMat &dst)
-{
-    CV_Assert (src1.depth() == CV_32F );
-    CV_Assert(src1.size() == dst.size());
-
-    dst.create(src1.size(), CV_32FC1);
-
-
-    Context *clCxt = src1.clCxt;
-    int channels = dst.oclchannels();
-    int depth = dst.depth();
-
-
-    int vector_lengths[4][7] = {{4, 0, 4, 4, 4, 4, 4},
-        {4, 0, 4, 4, 4, 4, 4},
-        {4, 0, 4, 4, 4, 4, 4},
-        {4, 0, 4, 4, 4, 4, 4}
-    };
-
-
-    size_t vector_length = vector_lengths[channels - 1][depth];
-    int offset_cols = (dst.offset / dst.elemSize1()) & (vector_length - 1);
-    int cols = divUp(dst.cols * channels + offset_cols, vector_length);
-
-    size_t localThreads[3]  = { 256, 1, 1 };
-    size_t globalThreads[3] = { divUp(cols, localThreads[0]) *localThreads[0],
-                                divUp(dst.rows, localThreads[1]) *localThreads[1],
-                                1
-                              };
-
-    int dst_step1 = dst.cols * dst.elemSize();
-    std::vector<std::pair<size_t , const void *> > args;
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src1.data ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1.step ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1.offset));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst.data ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.step ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.offset));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1.rows ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&cols ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst_step1 ));
-
-    openCLExecuteKernel(clCxt, &arithm_magnitudeSqr, "magnitudeSqr", globalThreads, localThreads, args, 2, depth);
-}
+//////////////////////////////////////////////////////////////////////////////
+/////////////////////////////////// Pow //////////////////////////////////////
+//////////////////////////////////////////////////////////////////////////////
 
 static void arithmetic_pow_run(const oclMat &src1, double p, oclMat &dst, String kernelName, const char **kernelString)
 {
@@ -2303,10 +1659,7 @@ static void arithmetic_pow_run(const oclMat &src1, double p, oclMat &dst, String
     int rows = dst.rows;
 
     size_t localThreads[3]  = { 64, 4, 1 };
-    size_t globalThreads[3] = { divUp(cols, localThreads[0]) *localThreads[0],
-                                divUp(rows, localThreads[1]) *localThreads[1],
-                                1
-                              };
+    size_t globalThreads[3] = { cols, rows, 1 };
 
     int dst_step1 = dst.cols * dst.elemSize();
     std::vector<std::pair<size_t , const void *> > args;
@@ -2319,19 +1672,19 @@ static void arithmetic_pow_run(const oclMat &src1, double p, oclMat &dst, String
     args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.rows ));
     args.push_back( std::make_pair( sizeof(cl_int), (void *)&cols ));
     args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst_step1 ));
-    float pf = p;
-    if(!src1.clCxt->supportsFeature(Context::CL_DOUBLE))
-    {
+
+    float pf = static_cast<float>(p);
+    if (!src1.clCxt->supportsFeature(Context::CL_DOUBLE))
         args.push_back( std::make_pair( sizeof(cl_float), (void *)&pf ));
-    }
     else
         args.push_back( std::make_pair( sizeof(cl_double), (void *)&p ));
 
     openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, -1, depth);
 }
+
 void cv::ocl::pow(const oclMat &x, double p, oclMat &y)
 {
-    if(!x.clCxt->supportsFeature(Context::CL_DOUBLE) && x.type() == CV_64F)
+    if (!x.clCxt->supportsFeature(Context::CL_DOUBLE) && x.type() == CV_64F)
     {
         std::cout << "Selected device do not support double" << std::endl;
         return;
@@ -2343,3 +1696,68 @@ void cv::ocl::pow(const oclMat &x, double p, oclMat &y)
 
     arithmetic_pow_run(x, p, y, kernelName, &arithm_pow);
 }
+
+//////////////////////////////////////////////////////////////////////////////
+/////////////////////////////// setIdentity //////////////////////////////////
+//////////////////////////////////////////////////////////////////////////////
+
+void cv::ocl::setIdentity(oclMat& src, double scalar)
+{
+    // Launches a setIdentityKernel_* kernel from the arithm_setidentity
+    // program on a non-empty, square CV_32SC1 or CV_32FC1 matrix, passing
+    // `scalar` as the kernel's value argument (presumably the diagonal value,
+    // as in cv::setIdentity - the kernel source is not in this file, so
+    // confirm there).
+    //
+    //   src    - matrix modified in place; any other type or shape fails the
+    //            assertions below.
+    //   scalar - value handed to the kernel; converted to int for CV_32SC1,
+    //            and to float for CV_32FC1 on devices without double support.
+    // Preconditions are enforced with CV_Assert.
+    CV_Assert(src.empty() == false && src.rows == src.cols);
+    CV_Assert(src.type() == CV_32SC1 || src.type() == CV_32FC1);
+
+    // The assertion above leaves exactly two admissible types, so one flag
+    // selects both the kernel name and the type of the scalar argument.
+    const bool isInt = src.type() == CV_32SC1;
+
+    Context *clCxt = Context::getContext();
+    // Row stride expressed in elements rather than bytes.
+    int src_step = src.step/src.elemSize();
+    size_t local_threads[] = {16, 16, 1};
+    size_t global_threads[] = {src.cols, src.rows, 1};
+
+    // The _I1 kernel is used for CV_32SC1 and the _F1 kernel for CV_32FC1.
+    String kernelName(isInt ? "setIdentityKernel_I1" : "setIdentityKernel_F1");
+
+    // Kernel arguments, in order: buffer, rows, cols, step, scalar.
+    std::vector<std::pair<size_t , const void *> > args;
+    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src.data ));
+    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.rows));
+    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.cols));
+    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src_step ));
+
+    // args stores only addresses, so the converted scalars are declared at
+    // function scope to stay alive until the kernel has been launched.
+    int scalar_i = 0;
+    float scalar_f = 0.0f;
+    if (isInt)
+    {
+        // Integer matrices take an int scalar regardless of double support.
+        scalar_i = (int)scalar;
+        args.push_back(std::make_pair(sizeof(cl_int), (void*)&scalar_i));
+    }
+    else if (clCxt->supportsFeature(Context::CL_DOUBLE))
+    {
+        // Float matrices receive the scalar as a double when the device can.
+        args.push_back(std::make_pair(sizeof(cl_double), (void*)&scalar));
+    }
+    else
+    {
+        // No double support: narrow the scalar to float before passing it.
+        scalar_f = (float)scalar;
+        args.push_back(std::make_pair(sizeof(cl_float), (void*)&scalar_f));
+    }
+
+    openCLExecuteKernel(clCxt, &arithm_setidentity, kernelName, global_threads, local_threads, args, -1, -1);
+}
diff --git a/modules/ocl/src/canny.cpp b/modules/ocl/src/canny.cpp
index abe98cf59..fa7a0ba8a 100644
--- a/modules/ocl/src/canny.cpp
+++ b/modules/ocl/src/canny.cpp
@@ -357,14 +357,13 @@ void canny::edgesHysteresisGlobal_gpu(oclMat &map, oclMat &st1, oclMat &st2, voi
     std::vector< std::pair<size_t, const void *> > args;
     size_t localThreads[3]  = {128, 1, 1};
 
-#define DIVUP(a, b) ((a)+(b)-1)/(b)
     int count_i[1] = {0};
     while(count > 0)
     {
         openCLSafeCall(clEnqueueWriteBuffer(*(cl_command_queue*)getoclCommandQueue(), (cl_mem)counter, 1, 0, sizeof(int), &count_i, 0, NULL, NULL));
 
         args.clear();
-        size_t globalThreads[3] = {std::min(count, 65535u) * 128, DIVUP(count, 65535), 1};
+        size_t globalThreads[3] = {std::min(count, 65535u) * 128, divUp(count, 65535), 1};
         args.push_back( std::make_pair( sizeof(cl_mem), (void *)&map.data));
         args.push_back( std::make_pair( sizeof(cl_mem), (void *)&st1.data));
         args.push_back( std::make_pair( sizeof(cl_mem), (void *)&st2.data));
@@ -379,7 +378,6 @@ void canny::edgesHysteresisGlobal_gpu(oclMat &map, oclMat &st1, oclMat &st2, voi
         openCLSafeCall(clEnqueueReadBuffer(*(cl_command_queue*)getoclCommandQueue(), (cl_mem)counter, 1, 0, sizeof(int), &count, 0, NULL, NULL));
         std::swap(st1, st2);
     }
-#undef DIVUP
 }
 
 void canny::getEdges_gpu(oclMat &map, oclMat &dst, int rows, int cols)
diff --git a/modules/ocl/src/filtering.cpp b/modules/ocl/src/filtering.cpp
index bbbdf8bc1..a30b9d9c2 100644
--- a/modules/ocl/src/filtering.cpp
+++ b/modules/ocl/src/filtering.cpp
@@ -67,22 +67,12 @@ extern const char *filtering_adaptive_bilateral;
 }
 }
 
-namespace
-{
-inline int divUp(int total, int grain)
-{
-    return (total + grain - 1) / grain;
-}
-}
-
 namespace
 {
 inline void normalizeAnchor(int &anchor, int ksize)
 {
     if (anchor < 0)
-    {
         anchor = ksize >> 1;
-    }
 
     CV_Assert(0 <= anchor && anchor < ksize);
 }
@@ -96,9 +86,7 @@ inline void normalizeAnchor(Point &anchor, const Size &ksize)
 inline void normalizeROI(Rect &roi, const Size &ksize, const Point &anchor, const Size &src_size)
 {
     if (roi == Rect(0, 0, -1, -1))
-    {
         roi = Rect(0, 0, src_size.width, src_size.height);
-    }
 
     CV_Assert(ksize.height > 0 && ksize.width > 0 && ((ksize.height & 1) == 1) && ((ksize.width & 1) == 1));
     CV_Assert((anchor.x == -1 && anchor.y == -1) || (anchor.x == ksize.width >> 1 && anchor.y == ksize.height >> 1));
@@ -111,10 +99,7 @@ inline void normalizeKernel(const Mat &kernel, oclMat &gpu_krnl, int type = CV_8
     int scale = nDivisor && (kernel.depth() == CV_32F || kernel.depth() == CV_64F) ? 256 : 1;
 
     if (nDivisor)
-    {
         *nDivisor = scale;
-    }
-
     Mat temp(kernel.size(), type);
     kernel.convertTo(temp, type, scale);
     Mat cont_krnl = temp.reshape(1, 1);
@@ -124,9 +109,7 @@ inline void normalizeKernel(const Mat &kernel, oclMat &gpu_krnl, int type = CV_8
         int count = cont_krnl.cols >> 1;
 
         for (int i = 0; i < count; ++i)
-        {
             std::swap(cont_krnl.at<int>(0, i), cont_krnl.at<int>(0, cont_krnl.cols - 1 - i));
-        }
     }
 
     gpu_krnl.upload(cont_krnl);
@@ -146,7 +129,7 @@ public:
     {
         Size src_size = src.size();
 
-        // Delete those two clause below which exist before, However, the result is alos correct
+        // The two statements below used to be here and were deleted; the result is still correct without them
         // dst.create(src_size, src.type());
         // dst = Scalar(0.0);
 
@@ -411,23 +394,8 @@ public:
     {
         Filter2DEngine_GPU::apply(src, dst);
 
-        //if (iters > 1)
-        //{
-        // Size wholesize;
-        // Point ofs;
-        // dst.locateROI(wholesize,ofs);
-        // int rows = dst.rows, cols = dst.cols;
-        // dst.adjustROI(ofs.y,-ofs.y-rows+dst.wholerows,ofs.x,-ofs.x-cols+dst.wholecols);
-        // dst.copyTo(morfBuf);
-        // dst.adjustROI(-ofs.y,ofs.y+rows-dst.wholerows,-ofs.x,ofs.x+cols-dst.wholecols);
-        // morfBuf.adjustROI(-ofs.y,ofs.y+rows-dst.wholerows,-ofs.x,ofs.x+cols-dst.wholecols);
-        // //morfBuf.create(src.size(),src.type());
-        // //Filter2DEngine_GPU::apply(dst, morfBuf);
-        // //morfBuf.copyTo(dst);
-        //}
         for (int i = 1; i < iters; ++i)
         {
-            //dst.swap(morfBuf);
             Size wholesize;
             Point ofs;
             dst.locateROI(wholesize, ofs);
@@ -627,8 +595,6 @@ static void GPUFilter2D(const oclMat &src, oclMat &dst, const oclMat &mat_kernel
     int localWidth = localThreads[0] + paddingPixels;
     int localHeight = localThreads[1] + paddingPixels;
 
-    // 260 = divup((localThreads[0] + filterWidth * 2), 4) * 4
-    // 6   = (ROWS_PER_GROUP_WHICH_IS_4 + filterWidth * 2)
     size_t localMemSize = ksize_3x3 ? 260 * 6 * src.elemSize() : (localWidth * localHeight) * src.elemSize();
 
     int vector_lengths[4][7] = {{4, 4, 4, 4, 4, 4, 4},
@@ -739,24 +705,16 @@ public:
     virtual void apply(const oclMat &src, oclMat &dst, Rect roi = Rect(0, 0, -1, -1))
     {
         Size src_size = src.size();
-        //int src_type = src.type();
 
         int cn = src.oclchannels();
-        //dst.create(src_size, src_type);
-        //dst = Scalar(0.0);
-        //dstBuf.create(src_size, src_type);
         dstBuf.create(src_size.height + ksize.height - 1, src_size.width, CV_MAKETYPE(CV_32F, cn));
-        //dstBuf = Scalar(0.0);
 
         normalizeROI(roi, ksize, anchor, src_size);
 
         srcROI = src(roi);
         dstROI = dst(roi);
-        //dstBufROI = dstBuf(roi);
 
         (*rowFilter)(srcROI, dstBuf);
-        //Mat rm(dstBufROI);
-        //std::cout << "rm " << rm << endl;
         (*columnFilter)(dstBuf, dstROI);
     }
 
@@ -1343,11 +1301,8 @@ void linearColumnFilter_gpu(const oclMat &src, const oclMat &dst, oclMat mat_ker
     CV_Assert(src.oclchannels() == dst.oclchannels());
     CV_Assert(ksize == (anchor << 1) + 1);
     int src_pix_per_row, dst_pix_per_row;
-    //int src_offset_x, src_offset_y;
     int dst_offset_in_pixel;
     src_pix_per_row = src.step / src.elemSize();
-    //src_offset_x = (src.offset % src.step) / src.elemSize();
-    //src_offset_y = src.offset / src.step;
     dst_pix_per_row = dst.step / dst.elemSize();
     dst_offset_in_pixel = dst.offset / dst.elemSize();
 
@@ -1359,8 +1314,6 @@ void linearColumnFilter_gpu(const oclMat &src, const oclMat &dst, oclMat mat_ker
     args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.wholecols));
     args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.wholerows));
     args.push_back(std::make_pair(sizeof(cl_int), (void *)&src_pix_per_row));
-    //args.push_back(std::make_pair(sizeof(cl_int),(void*)&src_offset_x));
-    //args.push_back(std::make_pair(sizeof(cl_int),(void*)&src_offset_y));
     args.push_back(std::make_pair(sizeof(cl_int), (void *)&dst_pix_per_row));
     args.push_back(std::make_pair(sizeof(cl_int), (void *)&dst_offset_in_pixel));
     args.push_back(std::make_pair(sizeof(cl_mem), (void *)&mat_kernel.data));
@@ -1379,23 +1332,11 @@ Ptr<BaseColumnFilter_GPU> cv::ocl::getLinearColumnFilter_GPU(int /*bufType*/, in
         linearColumnFilter_gpu<int>,
         linearColumnFilter_gpu<float>
     };
-    /*
-    CV_Assert(dstType == CV_8UC4 || dstType == CV_8SC4 || dstType == CV_16UC2 ||
-    dstType == CV_16SC2 || dstType == CV_32SC1 || dstType == CV_32FC1);
-    CV_Assert(bufType == CV_8UC4 || bufType == CV_8SC4 || bufType == CV_16UC2 ||
-    bufType == CV_16SC2 || bufType == CV_32SC1 || bufType == CV_32FC1);
 
-    Mat temp(columnKernel.size(), CV_32SC1);
-    columnKernel.convertTo(temp, CV_32SC1);
-    Mat cont_krnl = temp.reshape(1, 1);
-    */
     Mat temp = columnKernel.reshape(1, 1);
     oclMat mat_kernel(temp);
 
     int ksize = temp.cols;
-
-    //CV_Assert(ksize < 16);
-
     normalizeAnchor(anchor, ksize);
 
     return makePtr<GpuLinearColumnFilter>(ksize, anchor, mat_kernel,
@@ -1433,11 +1374,8 @@ void cv::ocl::sepFilter2D(const oclMat &src, oclMat &dst, int ddepth, const Mat
     }
 
     if (ddepth < 0)
-    {
         ddepth = src.depth();
-    }
 
-    //CV_Assert(ddepth == src.depth());
     dst.create(src.size(), CV_MAKETYPE(ddepth, src.channels()));
 
     Ptr<FilterEngine_GPU> f = createSeparableLinearFilter_GPU(src.type(), dst.type(), kernelX, kernelY, anchor, delta, bordertype);
@@ -1464,19 +1402,11 @@ void cv::ocl::Sobel(const oclMat &src, oclMat &dst, int ddepth, int dx, int dy,
         // usually the smoothing part is the slowest to compute,
         // so try to scale it instead of the faster differenciating part
         if (dx == 0)
-        {
             kx *= scale;
-        }
         else
-        {
             ky *= scale;
-        }
     }
 
-    // Mat kx_, ky_;
-    //ky.convertTo(ky_,CV_32S,1<<8);
-    //kx.convertTo(kx_,CV_32S,1<<8);
-
     sepFilter2D(src, dst, ddepth, kx, ky, Point(-1, -1), delta, borderType);
 }
 
@@ -1490,19 +1420,11 @@ void cv::ocl::Scharr(const oclMat &src, oclMat &dst, int ddepth, int dx, int dy,
         // usually the smoothing part is the slowest to compute,
         // so try to scale it instead of the faster differenciating part
         if (dx == 0)
-        {
             kx *= scale;
-        }
         else
-        {
             ky *= scale;
-        }
     }
 
-    // Mat kx_, ky_;
-    //ky.convertTo(ky_,CV_32S,1<<8);
-    //kx.convertTo(kx_,CV_32S,1<<8);
-
     sepFilter2D(src, dst, ddepth, kx, ky, Point(-1, -1), delta, bordertype);
 }
 
@@ -1524,9 +1446,7 @@ void cv::ocl::Laplacian(const oclMat &src, oclMat &dst, int ddepth, int ksize, d
     Mat kernel(3, 3, CV_32S, (void *)K[ksize == 3]);
 
     if (scale != 1)
-    {
         kernel *= scale;
-    }
 
     filter2D(src, dst, ddepth, kernel, Point(-1, -1));
 }
@@ -1545,14 +1465,10 @@ Ptr<FilterEngine_GPU> cv::ocl::createGaussianFilter_GPU(int type, Size ksize, do
 
     // automatic detection of kernel size from sigma
     if (ksize.width <= 0 && sigma1 > 0)
-    {
         ksize.width = cvRound(sigma1 * (depth == CV_8U ? 3 : 4) * 2 + 1) | 1;
-    }
 
     if (ksize.height <= 0 && sigma2 > 0)
-    {
         ksize.height = cvRound(sigma2 * (depth == CV_8U ? 3 : 4) * 2 + 1) | 1;
-    }
 
     CV_Assert(ksize.width > 0 && ksize.width % 2 == 1 && ksize.height > 0 && ksize.height % 2 == 1);
 
@@ -1563,17 +1479,10 @@ Ptr<FilterEngine_GPU> cv::ocl::createGaussianFilter_GPU(int type, Size ksize, do
     Mat ky;
 
     if (ksize.height == ksize.width && std::abs(sigma1 - sigma2) < DBL_EPSILON)
-    {
         ky = kx;
-    }
     else
-    {
         ky = getGaussianKernel(ksize.height, sigma2, std::max(depth, CV_32F));
-    }
 
-    //Mat kx_, ky_;
-    //kx.convertTo(kx_,CV_32S,1<<8);
-    //ky.convertTo(ky_,CV_32S,1<<8);
     return createSeparableLinearFilter_GPU(type, type, kx, ky, Point(-1, -1), 0.0, bordertype);
 }
 
@@ -1604,14 +1513,10 @@ void cv::ocl::GaussianBlur(const oclMat &src, oclMat &dst, Size ksize, double si
     if (bordertype != BORDER_CONSTANT)
     {
         if (src.rows == 1)
-        {
             ksize.height = 1;
-        }
 
         if (src.cols == 1)
-        {
             ksize.width = 1;
-        }
     }
 
     Ptr<FilterEngine_GPU> f = createGaussianFilter_GPU(src.type(), ksize, sigma1, sigma2, bordertype);
@@ -1637,6 +1542,7 @@ void cv::ocl::adaptiveBilateralFilter(const oclMat& src, oclMat& dst, Size ksize
     {
         lut.at<float>(idx++) = sigma2 / (sigma2 + x * x + y * y);
     }
+
     oclMat dlut(lut);
     int depth = src.depth();
     int cn = src.oclchannels();
diff --git a/modules/ocl/src/hog.cpp b/modules/ocl/src/hog.cpp
index ac113d27b..9eaa79797 100644
--- a/modules/ocl/src/hog.cpp
+++ b/modules/ocl/src/hog.cpp
@@ -124,11 +124,6 @@ namespace cv
 
 using namespace ::cv::ocl::device;
 
-static inline int divUp(int total, int grain)
-{
-    return (total + grain - 1) / grain;
-}
-
 cv::ocl::HOGDescriptor::HOGDescriptor(Size win_size_, Size block_size_, Size block_stride_,
                                       Size cell_size_, int nbins_, double win_sigma_,
                                       double threshold_L2hys_, bool gamma_correction_, int nlevels_)
@@ -1671,7 +1666,8 @@ void cv::ocl::device::hog::compute_hists(int nbins,
     {
         openCLExecuteKernel2(clCxt, &objdetect_hog, kernelName, globalThreads,
             localThreads, args, -1, -1, "-D CPU");
-    }else
+    }
+    else
     {
         cl_kernel kernel = openCLGetKernelFromSource(clCxt, &objdetect_hog, kernelName);
         int wave_size = queryDeviceInfo<WAVEFRONT_SIZE, int>(kernel);
diff --git a/modules/ocl/src/imgproc.cpp b/modules/ocl/src/imgproc.cpp
index 0dc7fe9ce..49fd8c509 100644
--- a/modules/ocl/src/imgproc.cpp
+++ b/modules/ocl/src/imgproc.cpp
@@ -245,9 +245,6 @@ namespace cv
                     kernelName = "remapNNF1Constant";
             }
 
-            //int channels = dst.oclchannels();
-            //int depth = dst.depth();
-            //int type = src.type();
             size_t blkSizeX = 16, blkSizeY = 16;
             size_t glbSizeX;
             int cols = dst.cols;
@@ -501,21 +498,13 @@ namespace cv
                 openCLExecuteKernel(clCxt, &imgproc_median, kernelName, globalThreads, localThreads, args, src.oclchannels(), src.depth());
             }
             else
-            {
                 CV_Error(Error::StsUnsupportedFormat, "Non-supported filter length");
-                //String kernelName = "medianFilter";
-                //args.push_back( std::make_pair( sizeof(cl_int),(void*)&m));
-
-                //openCLExecuteKernel(clCxt,&imgproc_median,kernelName,globalThreads,localThreads,args,src.oclchannels(),-1);
-            }
-
         }
 
         ////////////////////////////////////////////////////////////////////////
         // copyMakeBorder
         void copyMakeBorder(const oclMat &src, oclMat &dst, int top, int bottom, int left, int right, int bordertype, const Scalar &scalar)
         {
-            //CV_Assert(src.oclchannels() != 2);
             CV_Assert(top >= 0 && bottom >= 0 && left >= 0 && right >= 0);
             if((dst.cols != dst.wholecols) || (dst.rows != dst.wholerows)) //has roi
             {
@@ -531,10 +520,12 @@ namespace cv
             {
                 CV_Assert((src.cols >= left) && (src.cols >= right) && (src.rows >= top) && (src.rows >= bottom));
             }
+
             if(bordertype == cv::BORDER_REFLECT_101)
             {
                 CV_Assert((src.cols > left) && (src.cols > right) && (src.rows > top) && (src.rows > bottom));
             }
+
             dst.create(src.rows + top + bottom, src.cols + left + right, src.type());
             int srcStep = src.step1() / src.oclchannels();
             int dstStep = dst.step1() / dst.oclchannels();
@@ -734,19 +725,6 @@ namespace cv
             }
 
             openCLExecuteKernel(src.clCxt, &imgproc_copymakeboder, kernelName, globalThreads, localThreads, args, -1, -1, compile_option);
-            //uchar* cputemp=new uchar[32*dst.wholerows];
-            ////int* cpudata=new int[this->step*this->wholerows/sizeof(int)];
-            //openCLSafeCall(clEnqueueReadBuffer(src.clCxt->impl->clCmdQueue, (cl_mem)dst.data, CL_TRUE,
-            //						0, 32*dst.wholerows, cputemp, 0, NULL, NULL));
-            //for(int i=0;i<dst.wholerows;i++)
-            //{
-            //	for(int j=0;j<dst.wholecols;j++)
-            //	{
-            //		std::cout<< (int)cputemp[i*32+j]<<" ";
-            //	}
-            //	std::cout<<std::endl;
-            //}
-            //delete []cputemp;
         }
 
         ////////////////////////////////////////////////////////////////////////
@@ -1512,11 +1490,6 @@ namespace cv
         // CLAHE
         namespace clahe
         {
-            inline int divUp(int total, int grain)
-            {
-                return (total + grain - 1) / grain * grain;
-            }
-
             static void calcLut(const oclMat &src, oclMat &dst,
                 const int tilesX, const int tilesY, const cv::Size tileSize,
                 const int clipLimit, const float lutScale)
@@ -1540,9 +1513,7 @@ namespace cv
                 size_t globalThreads[3] = { tilesX * localThreads[0], tilesY * localThreads[1], 1 };
                 bool is_cpu = queryDeviceInfo<IS_CPU_DEVICE, bool>();
                 if (is_cpu)
-                {
                     openCLExecuteKernel(Context::getContext(), &imgproc_clahe, kernelName, globalThreads, localThreads, args, -1, -1, (char*)" -D CPU");
-                }
                 else
                 {
                     cl_kernel kernel = openCLGetKernelFromSource(Context::getContext(), &imgproc_clahe, kernelName);
@@ -1577,7 +1548,7 @@ namespace cv
 
                 String kernelName = "transform";
                 size_t localThreads[3]  = { 32, 8, 1 };
-                size_t globalThreads[3] = { divUp(src.cols, localThreads[0]), divUp(src.rows, localThreads[1]), 1 };
+                size_t globalThreads[3] = { src.cols, src.rows, 1 };
 
                 openCLExecuteKernel(Context::getContext(), &imgproc_clahe, kernelName, globalThreads, localThreads, args, -1, -1);
             }
@@ -1819,11 +1790,6 @@ void cv::ocl::mulSpectrums(const oclMat &a, const oclMat &b, oclMat &c, int /*fl
     openCLExecuteKernel(clCxt, &imgproc_mulAndScaleSpectrums, kernelName, gt, lt, args, -1, -1);
 }
 //////////////////////////////////convolve////////////////////////////////////////////////////
-inline int divUp(int total, int grain)
-{
-    return (total + grain - 1) / grain;
-}
-
 // ported from CUDA module
 void cv::ocl::ConvolveBuf::create(Size image_size, Size templ_size)
 {
@@ -1938,6 +1904,7 @@ static void convolve_run_fft(const oclMat &image, const oclMat &templ, oclMat &r
 #undef UNUSED
 #endif
 }
+
 static void convolve_run(const oclMat &src, const oclMat &temp1, oclMat &dst, String kernelName, const char **kernelString)
 {
     CV_Assert(src.depth() == CV_32FC1);
@@ -1959,10 +1926,7 @@ static void convolve_run(const oclMat &src, const oclMat &temp1, oclMat &dst, St
     int rows = dst.rows;
 
     size_t localThreads[3]  = { 16, 16, 1 };
-    size_t globalThreads[3] = { divUp(cols, localThreads[0]) *localThreads[0],
-                                divUp(rows, localThreads[1]) *localThreads[1],
-                                1
-                              };
+    size_t globalThreads[3] = { cols, rows, 1 };
 
     std::vector<std::pair<size_t , const void *> > args;
     args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src.data ));
diff --git a/modules/ocl/src/initialization.cpp b/modules/ocl/src/initialization.cpp
index 827b7d495..00158ac37 100644
--- a/modules/ocl/src/initialization.cpp
+++ b/modules/ocl/src/initialization.cpp
@@ -282,11 +282,6 @@ namespace cv
             return 0;
         }
 
-        inline int divUp(int total, int grain)
-        {
-            return (total + grain - 1) / grain;
-        }
-
         int getDevice(std::vector<Info> &oclinfo, int devicetype)
         {
             //TODO: cache oclinfo vector
@@ -687,6 +682,16 @@ namespace cv
             CV_Assert( localThreads[0] * localThreads[1] * localThreads[2] <= clCxt->impl->maxWorkGroupSize );
         }
 
+        static inline size_t roundUp(size_t sz, size_t n)
+        {
+            // Rounds sz up to the nearest multiple of n, i.e. divUp(sz, n) * n.
+            // n is not assumed to be a power of 2 (see alignSize), so the
+            // rounding is done with '%' instead of a bit mask.
+            const size_t padded = sz + n - 1;
+            // Dropping the remainder leaves the largest multiple of n <= padded.
+            return padded - padded % n;
+        }
+
 #ifdef PRINT_KERNEL_RUN_TIME
         static double total_execute_time = 0;
         static double total_kernel_time = 0;
@@ -710,11 +715,10 @@ namespace cv
 
             if ( localThreads != NULL)
             {
-                globalThreads[0] = divUp(globalThreads[0], localThreads[0]) * localThreads[0];
-                globalThreads[1] = divUp(globalThreads[1], localThreads[1]) * localThreads[1];
-                globalThreads[2] = divUp(globalThreads[2], localThreads[2]) * localThreads[2];
+                globalThreads[0] = roundUp(globalThreads[0], localThreads[0]);
+                globalThreads[1] = roundUp(globalThreads[1], localThreads[1]);
+                globalThreads[2] = roundUp(globalThreads[2], localThreads[2]);
 
-                //size_t blockSize = localThreads[0] * localThreads[1] * localThreads[2];
                 cv::ocl::openCLVerifyKernel(clCxt, kernel, localThreads);
             }
             for(size_t i = 0; i < args.size(); i ++)
@@ -745,10 +749,6 @@ namespace cv
             execute_time = (double)(end_time - start_time) / (1000 * 1000);
             total_time = (double)(end_time - queue_time) / (1000 * 1000);
 
-            //	std::cout << setiosflags(ios::left) << setw(15) << execute_time;
-            //	std::cout << setiosflags(ios::left) << setw(15) << total_time - execute_time;
-            //	std::cout << setiosflags(ios::left) << setw(15) << total_time << std::endl;
-
             total_execute_time += execute_time;
             total_kernel_time += total_time;
             clReleaseEvent(event);
@@ -1016,7 +1016,7 @@ namespace cv
             programCache->releaseProgram();
         }
 
-        bool Context::supportsFeature(int ftype)
+        bool Context::supportsFeature(int ftype) const
         {
             switch(ftype)
             {
@@ -1031,7 +1031,7 @@ namespace cv
             }
         }
 
-        size_t Context::computeUnits()
+        size_t Context::computeUnits() const
         {
             return impl->maxComputeUnits;
         }
@@ -1041,6 +1041,14 @@ namespace cv
             return impl->maxWorkGroupSize;
         }
 
+        unsigned long queryLocalMemInfo()
+        {
+            cl_ulong localMemBytes = 0; // receives CL_DEVICE_LOCAL_MEM_SIZE; the call's status is not checked
+            Info::Impl* ctxImpl = Context::getContext()->impl;
+            clGetDeviceInfo(ctxImpl->devices[ctxImpl->devnum], CL_DEVICE_LOCAL_MEM_SIZE, sizeof(localMemBytes), &localMemBytes, NULL);
+            return localMemBytes;
+        }
+
         void* Context::oclContext()
         {
             return impl->oclcontext;
diff --git a/modules/ocl/src/kalman.cpp b/modules/ocl/src/kalman.cpp
new file mode 100644
index 000000000..8a5b0d4c2
--- /dev/null
+++ b/modules/ocl/src/kalman.cpp
@@ -0,0 +1,135 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//     Jin Ma, jin@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+#include "precomp.hpp"
+
+using namespace std;
+using namespace cv;
+using namespace cv::ocl;
+
+KalmanFilter::KalmanFilter()
+{
+    // Empty: no matrices are allocated until init() is called.
+}
+
+KalmanFilter::KalmanFilter(int dynamParams, int measureParams, int controlParams, int type)
+{
+    init(dynamParams, measureParams, controlParams, type); // allocate and reset all filter matrices
+}
+
+void KalmanFilter::init(int DP, int MP, int CP, int type)
+{
+    // (Re)allocates every filter matrix for DP state, MP measurement and CP
+    // control dimensions: states and covariances are zeroed, while the
+    // transition and both noise covariance matrices start as identity.
+    CV_Assert( DP > 0 && MP > 0 );
+    // Only CV_32F is accepted: the setIdentity() calls below assert a 32-bit
+    // matrix type, so CV_64F could never get past this function anyway.
+    CV_Assert( type == CV_32F );
+    CP = cv::max(CP, 0); // negative CP is treated as "no control input"
+
+    statePre.create(DP, 1, type);
+    statePre.setTo(Scalar::all(0));
+    statePost.create(DP, 1, type);
+    statePost.setTo(Scalar::all(0));
+
+    transitionMatrix.create(DP, DP, type);
+    setIdentity(transitionMatrix, 1);
+    processNoiseCov.create(DP, DP, type);
+    setIdentity(processNoiseCov, 1);
+    measurementNoiseCov.create(MP, MP, type);
+    setIdentity(measurementNoiseCov, 1);
+
+    measurementMatrix.create(MP, DP, type);
+    measurementMatrix.setTo(Scalar::all(0));
+    errorCovPre.create(DP, DP, type);
+    errorCovPre.setTo(Scalar::all(0));
+    errorCovPost.create(DP, DP, type);
+    errorCovPost.setTo(Scalar::all(0));
+    gain.create(DP, MP, type);
+    gain.setTo(Scalar::all(0));
+
+    if( CP > 0 )
+    {
+        controlMatrix.create(DP, CP, type);
+        controlMatrix.setTo(Scalar::all(0));
+    }
+    else
+        controlMatrix.release();
+
+    // Scratch buffers reused by predict() and correct().
+    temp1.create(DP, DP, type);
+    temp2.create(MP, DP, type);
+    temp3.create(MP, MP, type);
+    temp4.create(MP, DP, type);
+    temp5.create(MP, 1, type);
+}
+
+CV_EXPORTS const oclMat& KalmanFilter::predict(const oclMat& control)
+{
+    // statePre = transitionMatrix * statePost
+    gemm(transitionMatrix, statePost, 1, oclMat(), 0, statePre);
+    if(control.data) // statePre += controlMatrix * control, only when a control vector is supplied
+        gemm(controlMatrix, control, 1, statePre, 1, statePre);
+    // errorCovPre = transitionMatrix * errorCovPost * transitionMatrix^T + processNoiseCov
+    gemm(transitionMatrix, errorCovPost, 1, oclMat(), 0, temp1);
+    gemm(temp1, transitionMatrix, 1, processNoiseCov, 1, errorCovPre, GEMM_2_T);
+    statePre.copyTo(statePost); // statePost mirrors the prediction until correct() overwrites it
+    return statePre;
+}
+
+CV_EXPORTS const oclMat& KalmanFilter::correct(const oclMat& measurement)
+{
+    CV_Assert(measurement.empty() == false);
+    gemm(measurementMatrix, errorCovPre, 1, oclMat(), 0, temp2); // temp2 = measurementMatrix * errorCovPre
+    gemm(temp2, measurementMatrix, 1, measurementNoiseCov, 1, temp3, GEMM_2_T); // temp3 = temp2 * measurementMatrix^T + measurementNoiseCov
+    Mat temp; // host-side solution of temp3 * temp = temp2
+    solve(Mat(temp3), Mat(temp2), temp, DECOMP_SVD); // solved through Mat conversions of the oclMat operands
+    temp4.upload(temp); // move the solution back into an oclMat
+    gain = temp4.t(); // gain is the transpose of the solution
+    gemm(measurementMatrix, statePre, -1, measurement, 1, temp5); // temp5 = measurement - measurementMatrix * statePre
+    gemm(gain, temp5, 1, statePre, 1, statePost); // statePost = statePre + gain * temp5
+    gemm(gain, temp2, -1, errorCovPre, 1, errorCovPost); // errorCovPost = errorCovPre - gain * temp2
+    return statePost;
+}
\ No newline at end of file
diff --git a/modules/ocl/src/knearest.cpp b/modules/ocl/src/knearest.cpp
new file mode 100644
index 000000000..a7a122266
--- /dev/null
+++ b/modules/ocl/src/knearest.cpp
@@ -0,0 +1,157 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Jin Ma, jin@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other oclMaterials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "precomp.hpp"
+using namespace cv;
+using namespace cv::ocl;
+
+namespace cv
+{
+    namespace ocl
+    {
+        extern const char* knearest; // defined in another translation unit; its address is passed to openCLExecuteKernel in find_nearest()
+    }
+}
+
+KNearestNeighbour::KNearestNeighbour()
+{
+    clear();  // forwards to CvKNearest::clear()
+}
+
+KNearestNeighbour::~KNearestNeighbour()
+{
+    clear();
+    samples_ocl.release();  // drop the sample matrix that train() stored for the kernel
+}
+
+void KNearestNeighbour::clear()
+{
+    CvKNearest::clear();  // only the base-class state is cleared; samples_ocl is not touched here
+}
+
+bool KNearestNeighbour::train(const Mat& trainData, Mat& labels, Mat& sampleIdx,
+                              bool isRegression, int _max_k, bool updateBase)
+{
+    max_k = _max_k;
+    bool cv_knn_train = CvKNearest::train(trainData, labels, sampleIdx, isRegression, max_k, updateBase);
+    // Repack the base-class samples into one matrix: var_count feature columns plus one extra column per row.
+    CvVectors* s = CvKNearest::samples;
+    // NOTE(review): s is dereferenced without a null check - confirm CvKNearest::train always leaves samples set.
+    cv::Mat samples_mat(s->count, CvKNearest::var_count + 1, s->type);
+    // presumably the per-sample responses are stored directly after the CvVectors header - confirm
+    float* s1 = (float*)(s + 1);
+    for(int i = 0; i < s->count; i++)
+    {
+        float* t1 = s->data.fl[i];
+        for(int j = 0; j < CvKNearest::var_count; j++)
+        {
+            Point pos(j, i);
+            samples_mat.at<float>(pos) = t1[j];
+        }
+
+        Point pos_label(CvKNearest::var_count, i);
+        samples_mat.at<float>(pos_label) = s1[i];
+    }
+    // Stored in samples_ocl, whose buffer find_nearest() hands to the kernel.
+    samples_ocl = samples_mat;
+    return cv_knn_train;
+}
+
+void KNearestNeighbour::find_nearest(const oclMat& samples, int k, oclMat& lables)
+{
+    CV_Assert(!samples_ocl.empty());
+    lables.create(samples.rows, 1, CV_32FC1);
+    // Output is one CV_32FC1 value per query row; queries must be CV_32FC1 with var_count columns.
+    CV_Assert(samples.cols == CvKNearest::var_count);
+    CV_Assert(samples.type() == CV_32FC1);
+    CV_Assert(k >= 1 && k <= max_k);
+    // k1 is k clamped to get_sample_count().
+    int k1 = KNearest::get_sample_count();
+    k1 = MIN( k1, k );
+    // Work-group size: local memory size divided by 2 * k * 4 bytes per work-item, capped at 256.
+    String kernel_name = "knn_find_nearest";
+    cl_ulong local_memory_size = queryLocalMemInfo();
+    int nThreads = local_memory_size / (2 * k * 4);
+    if(nThreads >= 256)
+        nThreads = 256;
+
+    int smem_size = nThreads * k * 4 * 2;
+    size_t local_thread[] = {1, nThreads, 1};
+    size_t global_thread[] = {1, samples.rows, 1};
+    // DOUBLE_SUPPORT is defined for the kernel build only when the context reports CL_DOUBLE.
+    char build_option[50];
+    if(!Context::getContext()->supportsFeature(Context::CL_DOUBLE))
+    {
+        sprintf(build_option, " ");
+    }else
+        sprintf(build_option, "-D DOUBLE_SUPPORT");
+
+    std::vector< std::pair<size_t, const void*> > args;
+    // Steps are passed to the kernel in elements (step / elemSize), not bytes.
+    int samples_ocl_step = samples_ocl.step/samples_ocl.elemSize();
+    int samples_step = samples.step/samples.elemSize();
+    int lables_step = lables.step/lables.elemSize();
+
+    int _regression = 0;
+    if(CvKNearest::regression)
+        _regression = 1;
+    // Last entry has a NULL value and size smem_size: presumably a local-memory kernel argument - confirm.
+    args.push_back(std::make_pair(sizeof(cl_mem), (void*)&samples.data));
+    args.push_back(std::make_pair(sizeof(cl_int), (void*)&samples.rows));
+    args.push_back(std::make_pair(sizeof(cl_int), (void*)&samples.cols));
+    args.push_back(std::make_pair(sizeof(cl_int), (void*)&samples_step));
+    args.push_back(std::make_pair(sizeof(cl_int), (void*)&k));
+    args.push_back(std::make_pair(sizeof(cl_mem), (void*)&samples_ocl.data));
+    args.push_back(std::make_pair(sizeof(cl_int), (void*)&samples_ocl.rows));
+    args.push_back(std::make_pair(sizeof(cl_int), (void*)&samples_ocl_step));
+    args.push_back(std::make_pair(sizeof(cl_mem), (void*)&lables.data));
+    args.push_back(std::make_pair(sizeof(cl_int), (void*)&lables_step));
+    args.push_back(std::make_pair(sizeof(cl_int), (void*)&_regression));
+    args.push_back(std::make_pair(sizeof(cl_int), (void*)&k1));
+    args.push_back(std::make_pair(sizeof(cl_int), (void*)&samples_ocl.cols));
+    args.push_back(std::make_pair(sizeof(cl_int), (void*)&nThreads));
+    args.push_back(std::make_pair(smem_size, (void*)NULL));
+    openCLExecuteKernel(Context::getContext(), &knearest, kernel_name, global_thread, local_thread, args, -1, -1, build_option);
+}
\ No newline at end of file
diff --git a/modules/ocl/src/matrix_operations.cpp b/modules/ocl/src/matrix_operations.cpp
index ddbd76db4..83a8808e6 100644
--- a/modules/ocl/src/matrix_operations.cpp
+++ b/modules/ocl/src/matrix_operations.cpp
@@ -295,11 +295,6 @@ void cv::ocl::oclMat::download(cv::Mat &m) const
     m.adjustROI(-ofs.y, ofs.y + rows - wholerows, -ofs.x, ofs.x + cols - wholecols);
 }
 
-/////////////////////common//////////////////////////////////////
-inline int divUp(int total, int grain)
-{
-    return (total + grain - 1) / grain;
-}
 ///////////////////////////////////////////////////////////////////////////
 ////////////////////////////////// CopyTo /////////////////////////////////
 ///////////////////////////////////////////////////////////////////////////
@@ -319,11 +314,7 @@ static void copy_to_with_mask(const oclMat &src, oclMat &dst, const oclMat &mask
     char compile_option[32];
     sprintf(compile_option, "-D GENTYPE=%s", string_types[dst.oclchannels() - 1][dst.depth()].c_str());
     size_t localThreads[3] = {16, 16, 1};
-    size_t globalThreads[3];
-
-    globalThreads[0] = divUp(dst.cols, localThreads[0]) * localThreads[0];
-    globalThreads[1] = divUp(dst.rows, localThreads[1]) * localThreads[1];
-    globalThreads[2] = 1;
+    size_t globalThreads[3] = { dst.cols, dst.rows, 1 };
 
     int dststep_in_pixel = dst.step / dst.elemSize(), dstoffset_in_pixel = dst.offset / dst.elemSize();
     int srcstep_in_pixel = src.step / src.elemSize(), srcoffset_in_pixel = src.offset / src.elemSize();
@@ -344,19 +335,14 @@ static void copy_to_with_mask(const oclMat &src, oclMat &dst, const oclMat &mask
                         localThreads, args, -1, -1, compile_option);
 }
 
-void cv::ocl::oclMat::copyTo( oclMat &m ) const
-{
-    CV_DbgAssert(!this->empty());
-    m.create(size(), type());
-    openCLCopyBuffer2D(clCxt, m.data, m.step, m.offset,
-                       data, step, cols * elemSize(), rows, offset);
-}
-
 void cv::ocl::oclMat::copyTo( oclMat &mat, const oclMat &mask) const
 {
     if (mask.empty())
     {
-        copyTo(mat);
+        CV_DbgAssert(!this->empty());
+        mat.create(size(), type());
+        openCLCopyBuffer2D(clCxt, mat.data, mat.step, mat.offset,
+                           data, step, cols * elemSize(), rows, offset);
     }
     else
     {
@@ -370,40 +356,50 @@ void cv::ocl::oclMat::copyTo( oclMat &mat, const oclMat &mask) const
 ///////////////////////////////////////////////////////////////////////////
 static void convert_run(const oclMat &src, oclMat &dst, double alpha, double beta)
 {
-    String kernelName = "convert_to_S";
-    std::stringstream idxStr;
-    idxStr << src.depth();
-    kernelName = kernelName + idxStr.str().c_str();
+    String kernelName = "convert_to";
     float alpha_f = alpha, beta_f = beta;
+    int sdepth = src.depth(), ddepth = dst.depth();
+    int sstep1 = (int)src.step1(), dstep1 = (int)dst.step1();
+    int cols1 = src.cols * src.oclchannels();
+    // srcT/dstT name the element types; convertToDstType is convert_<dstT>_sat_rte, or empty for float/double destinations.
+    char buildOptions[150], convertString[50];
+    const char * typeMap[] = { "uchar", "char", "ushort", "short", "int", "float", "double" };
+    sprintf(convertString, "convert_%s_sat_rte", typeMap[ddepth]);
+    sprintf(buildOptions, "-D srcT=%s -D dstT=%s -D convertToDstType=%s", typeMap[sdepth],
+            typeMap[ddepth], CV_32F == ddepth || ddepth == CV_64F ? "" : convertString);
+
     CV_DbgAssert(src.rows == dst.rows && src.cols == dst.cols);
     std::vector<std::pair<size_t , const void *> > args;
-    size_t localThreads[3] = {16, 16, 1};
-    size_t globalThreads[3];
-    globalThreads[0] = (dst.cols + localThreads[0] - 1) / localThreads[0] * localThreads[0];
-    globalThreads[1] = (dst.rows + localThreads[1] - 1) / localThreads[1] * localThreads[1];
-    globalThreads[2] = 1;
-    int dststep_in_pixel = dst.step / dst.elemSize(), dstoffset_in_pixel = dst.offset / dst.elemSize();
-    int srcstep_in_pixel = src.step / src.elemSize(), srcoffset_in_pixel = src.offset / src.elemSize();
-    if(dst.type() == CV_8UC1)
-    {
-        globalThreads[0] = ((dst.cols + 4) / 4 + localThreads[0]) / localThreads[0] * localThreads[0];
-    }
+    // Grid width is cols1 (cols * oclchannels()), not the pixel count; both dimensions are rounded up to the 16x16 work-group.
+    size_t localThreads[3] = { 16, 16, 1 };
+    size_t globalThreads[3] = { divUp(cols1, localThreads[0]) * localThreads[0],
+                                divUp(dst.rows, localThreads[1]) * localThreads[1], 1 };
+    // Byte offsets are converted to units of single-channel elements (elemSize1).
+    int doffset1 = dst.offset / dst.elemSize1();
+    int soffset1 = src.offset / src.elemSize1();
+
     args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data ));
     args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst.data ));
-    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.cols ));
+    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&cols1 ));
     args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.rows ));
-    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&srcstep_in_pixel ));
-    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&srcoffset_in_pixel ));
-    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dststep_in_pixel ));
-    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dstoffset_in_pixel ));
+    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&sstep1 ));
+    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&soffset1 ));
+    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dstep1 ));
+    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&doffset1 ));
     args.push_back( std::make_pair( sizeof(cl_float) , (void *)&alpha_f ));
     args.push_back( std::make_pair( sizeof(cl_float) , (void *)&beta_f ));
+
     openCLExecuteKernel(dst.clCxt , &operator_convertTo, kernelName, globalThreads,
-                        localThreads, args, dst.oclchannels(), dst.depth());
+                        localThreads, args, -1, -1, buildOptions);
 }
 void cv::ocl::oclMat::convertTo( oclMat &dst, int rtype, double alpha, double beta ) const
 {
-    //cout << "cv::ocl::oclMat::convertTo()" << endl;
+    if (!clCxt->supportsFeature(Context::CL_DOUBLE) &&
+            (depth() == CV_64F || dst.depth() == CV_64F))
+    {
+        CV_Error(CV_GpuNotSupported, "Selected device don't support double\r\n");
+        return;
+    }
 
     bool noScale = fabs(alpha - 1) < std::numeric_limits<double>::epsilon()
                    && fabs(beta) < std::numeric_limits<double>::epsilon();
@@ -413,7 +409,6 @@ void cv::ocl::oclMat::convertTo( oclMat &dst, int rtype, double alpha, double be
     else
         rtype = CV_MAKETYPE(CV_MAT_DEPTH(rtype), channels());
 
-    //int scn = channels();
     int sdepth = depth(), ddepth = CV_MAT_DEPTH(rtype);
     if( sdepth == ddepth && noScale )
     {
@@ -433,201 +428,62 @@ void cv::ocl::oclMat::convertTo( oclMat &dst, int rtype, double alpha, double be
 ///////////////////////////////////////////////////////////////////////////
 //////////////////////////////// setTo ////////////////////////////////////
 ///////////////////////////////////////////////////////////////////////////
+
 oclMat &cv::ocl::oclMat::operator = (const Scalar &s)
 {
-    //cout << "cv::ocl::oclMat::=" << endl;
     setTo(s);
     return *this;
 }
+
 static void set_to_withoutmask_run(const oclMat &dst, const Scalar &scalar, String kernelName)
 {
     std::vector<std::pair<size_t , const void *> > args;
 
     size_t localThreads[3] = {16, 16, 1};
-    size_t globalThreads[3];
-    globalThreads[0] = (dst.cols + localThreads[0] - 1) / localThreads[0] * localThreads[0];
-    globalThreads[1] = (dst.rows + localThreads[1] - 1) / localThreads[1] * localThreads[1];
-    globalThreads[2] = 1;
+    size_t globalThreads[3] = { dst.cols, dst.rows, 1 };
     int step_in_pixel = dst.step / dst.elemSize(), offset_in_pixel = dst.offset / dst.elemSize();
-    if(dst.type() == CV_8UC1)
-    {
+    // CV_8UC1 uses a narrower grid, about cols/4 per row (presumably that kernel handles 4 pixels per work-item - confirm).
+    if (dst.type() == CV_8UC1)
         globalThreads[0] = ((dst.cols + 4) / 4 + localThreads[0] - 1) / localThreads[0] * localThreads[0];
-    }
-    char compile_option[32];
-    union sc
-    {
-        cl_uchar4 uval;
-        cl_char4  cval;
-        cl_ushort4 usval;
-        cl_short4 shval;
-        cl_int4 ival;
-        cl_float4 fval;
-        cl_double4 dval;
-    } val;
-    switch(dst.depth())
-    {
-    case CV_8U:
-        val.uval.s[0] = saturate_cast<uchar>(scalar.val[0]);
-        val.uval.s[1] = saturate_cast<uchar>(scalar.val[1]);
-        val.uval.s[2] = saturate_cast<uchar>(scalar.val[2]);
-        val.uval.s[3] = saturate_cast<uchar>(scalar.val[3]);
-        switch(dst.oclchannels())
-        {
-        case 1:
-            sprintf(compile_option, "-D GENTYPE=uchar");
-            args.push_back( std::make_pair( sizeof(cl_uchar) , (void *)&val.uval.s[0] ));
-            break;
-        case 4:
-            sprintf(compile_option, "-D GENTYPE=uchar4");
-            args.push_back( std::make_pair( sizeof(cl_uchar4) , (void *)&val.uval ));
-            break;
-        default:
-            CV_Error(Error::StsUnsupportedFormat, "unsupported channels");
-        }
-        break;
-    case CV_8S:
-        val.cval.s[0] = saturate_cast<char>(scalar.val[0]);
-        val.cval.s[1] = saturate_cast<char>(scalar.val[1]);
-        val.cval.s[2] = saturate_cast<char>(scalar.val[2]);
-        val.cval.s[3] = saturate_cast<char>(scalar.val[3]);
-        switch(dst.oclchannels())
-        {
-        case 1:
-            sprintf(compile_option, "-D GENTYPE=char");
-            args.push_back( std::make_pair( sizeof(cl_char) , (void *)&val.cval.s[0] ));
-            break;
-        case 4:
-            sprintf(compile_option, "-D GENTYPE=char4");
-            args.push_back( std::make_pair( sizeof(cl_char4) , (void *)&val.cval ));
-            break;
-        default:
-            CV_Error(Error::StsUnsupportedFormat, "unsupported channels");
-        }
-        break;
-    case CV_16U:
-        val.usval.s[0] = saturate_cast<ushort>(scalar.val[0]);
-        val.usval.s[1] = saturate_cast<ushort>(scalar.val[1]);
-        val.usval.s[2] = saturate_cast<ushort>(scalar.val[2]);
-        val.usval.s[3] = saturate_cast<ushort>(scalar.val[3]);
-        switch(dst.oclchannels())
-        {
-        case 1:
-            sprintf(compile_option, "-D GENTYPE=ushort");
-            args.push_back( std::make_pair( sizeof(cl_ushort) , (void *)&val.usval.s[0] ));
-            break;
-        case 4:
-            sprintf(compile_option, "-D GENTYPE=ushort4");
-            args.push_back( std::make_pair( sizeof(cl_ushort4) , (void *)&val.usval ));
-            break;
-        default:
-            CV_Error(Error::StsUnsupportedFormat, "unsupported channels");
-        }
-        break;
-    case CV_16S:
-        val.shval.s[0] = saturate_cast<short>(scalar.val[0]);
-        val.shval.s[1] = saturate_cast<short>(scalar.val[1]);
-        val.shval.s[2] = saturate_cast<short>(scalar.val[2]);
-        val.shval.s[3] = saturate_cast<short>(scalar.val[3]);
-        switch(dst.oclchannels())
-        {
-        case 1:
-            sprintf(compile_option, "-D GENTYPE=short");
-            args.push_back( std::make_pair( sizeof(cl_short) , (void *)&val.shval.s[0] ));
-            break;
-        case 4:
-            sprintf(compile_option, "-D GENTYPE=short4");
-            args.push_back( std::make_pair( sizeof(cl_short4) , (void *)&val.shval ));
-            break;
-        default:
-            CV_Error(Error::StsUnsupportedFormat, "unsupported channels");
-        }
-        break;
-    case CV_32S:
-        val.ival.s[0] = saturate_cast<int>(scalar.val[0]);
-        val.ival.s[1] = saturate_cast<int>(scalar.val[1]);
-        val.ival.s[2] = saturate_cast<int>(scalar.val[2]);
-        val.ival.s[3] = saturate_cast<int>(scalar.val[3]);
-        switch(dst.oclchannels())
-        {
-        case 1:
-            sprintf(compile_option, "-D GENTYPE=int");
-            args.push_back( std::make_pair( sizeof(cl_int) , (void *)&val.ival.s[0] ));
-            break;
-        case 2:
-            sprintf(compile_option, "-D GENTYPE=int2");
-            cl_int2 i2val;
-            i2val.s[0] = val.ival.s[0];
-            i2val.s[1] = val.ival.s[1];
-            args.push_back( std::make_pair( sizeof(cl_int2) , (void *)&i2val ));
-            break;
-        case 4:
-            sprintf(compile_option, "-D GENTYPE=int4");
-            args.push_back( std::make_pair( sizeof(cl_int4) , (void *)&val.ival ));
-            break;
-        default:
-            CV_Error(Error::StsUnsupportedFormat, "unsupported channels");
-        }
-        break;
-    case CV_32F:
-        val.fval.s[0] = scalar.val[0];
-        val.fval.s[1] = scalar.val[1];
-        val.fval.s[2] = scalar.val[2];
-        val.fval.s[3] = scalar.val[3];
-        switch(dst.oclchannels())
-        {
-        case 1:
-            sprintf(compile_option, "-D GENTYPE=float");
-            args.push_back( std::make_pair( sizeof(cl_float) , (void *)&val.fval.s[0] ));
-            break;
-        case 4:
-            sprintf(compile_option, "-D GENTYPE=float4");
-            args.push_back( std::make_pair( sizeof(cl_float4) , (void *)&val.fval ));
-            break;
-        default:
-            CV_Error(Error::StsUnsupportedFormat, "unsupported channels");
-        }
-        break;
-    case CV_64F:
-        val.dval.s[0] = scalar.val[0];
-        val.dval.s[1] = scalar.val[1];
-        val.dval.s[2] = scalar.val[2];
-        val.dval.s[3] = scalar.val[3];
-        switch(dst.oclchannels())
-        {
-        case 1:
-            sprintf(compile_option, "-D GENTYPE=double");
-            args.push_back( std::make_pair( sizeof(cl_double) , (void *)&val.dval.s[0] ));
-            break;
-        case 4:
-            sprintf(compile_option, "-D GENTYPE=double4");
-            args.push_back( std::make_pair( sizeof(cl_double4) , (void *)&val.dval ));
-            break;
-        default:
-            CV_Error(Error::StsUnsupportedFormat, "unsupported channels");
-        }
-        break;
-    default:
-        CV_Error(Error::StsUnsupportedFormat, "unknown depth");
-    }
+    // GENTYPE is the depth's type name plus a channel suffix taken from channelMap (' ', '2' or '4').
+    const char * const typeMap[] = { "uchar", "char", "ushort", "short", "int", "float", "double" };
+    const char channelMap[] = { ' ', ' ', '2', '4', '4' };
+    std::string buildOptions = format("-D GENTYPE=%s%c", typeMap[dst.depth()], channelMap[dst.channels()]);
+    // Host-side 1x1 matrix of dst's type initialised from the scalar; it is the fill pattern for both paths below.
+    Mat mat(1, 1, dst.type(), scalar);
+    // NOTE(review): mat is sized by dst.type() but the fill pattern by oclchannels() - confirm they agree for 3-channel dst.
 #ifdef CL_VERSION_1_2
-    //this enables backwards portability to
-    //run on OpenCL 1.1 platform if library binaries are compiled with OpenCL 1.2 support
-    if(Context::getContext()->supportsFeature(Context::CL_VER_1_2) &&
+    // this enables backwards portability to
+    // run on OpenCL 1.1 platform if library binaries are compiled with OpenCL 1.2 support
+    if (Context::getContext()->supportsFeature(Context::CL_VER_1_2) &&
         dst.offset == 0 && dst.cols == dst.wholecols)
     {
+        const int sizeofMap[][7] =
+            {
+                { sizeof(cl_uchar) , sizeof(cl_char) , sizeof(cl_ushort) , sizeof(cl_short) , sizeof(cl_int) , sizeof(cl_float) , sizeof(cl_double)  },
+                { sizeof(cl_uchar2), sizeof(cl_char2), sizeof(cl_ushort2), sizeof(cl_short2), sizeof(cl_int2), sizeof(cl_float2), sizeof(cl_double2) },
+                { 0                , 0               , 0                 , 0                , 0              , 0                ,  0                 },
+                { sizeof(cl_uchar4), sizeof(cl_char4), sizeof(cl_ushort4), sizeof(cl_short4), sizeof(cl_int4), sizeof(cl_float4), sizeof(cl_double4) },
+            };
+        int sizeofGeneric = sizeofMap[dst.oclchannels() - 1][dst.depth()];
+        // Fill dst.step * dst.rows bytes from offset 0 with the pattern; no kernel is launched on this path.
         clEnqueueFillBuffer((cl_command_queue)dst.clCxt->oclCommandQueue(),
-            (cl_mem)dst.data, args[0].second, args[0].first, 0, dst.step * dst.rows, 0, NULL, NULL);
+                            (cl_mem)dst.data, (void*)mat.data, sizeofGeneric,
+                            0, dst.step * dst.rows, 0, NULL, NULL);
     }
     else
 #endif
     {
+        oclMat m(mat);  // the pattern is handed to the kernel as a buffer, not as a by-value argument
+        args.push_back( std::make_pair( sizeof(cl_mem) , (void*)&m.data ));
         args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst.data ));
         args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.cols ));
         args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.rows ));
         args.push_back( std::make_pair( sizeof(cl_int) , (void *)&step_in_pixel ));
-        args.push_back( std::make_pair( sizeof(cl_int) , (void *)&offset_in_pixel));
+        args.push_back( std::make_pair( sizeof(cl_int) , (void *)&offset_in_pixel ));
+
         openCLExecuteKernel(dst.clCxt , &operator_setTo, kernelName, globalThreads,
-            localThreads, args, -1, -1, compile_option);
+            localThreads, args, -1, -1, buildOptions.c_str());
     }
 }
 
@@ -635,161 +491,16 @@ static void set_to_withmask_run(const oclMat &dst, const Scalar &scalar, const o
 {
     CV_DbgAssert( dst.rows == mask.rows && dst.cols == mask.cols);
     std::vector<std::pair<size_t , const void *> > args;
-    size_t localThreads[3] = {16, 16, 1};
-    size_t globalThreads[3];
-    globalThreads[0] = (dst.cols + localThreads[0] - 1) / localThreads[0] * localThreads[0];
-    globalThreads[1] = (dst.rows + localThreads[1] - 1) / localThreads[1] * localThreads[1];
-    globalThreads[2] = 1;
+    size_t localThreads[3] = { 16, 16, 1 };
+    size_t globalThreads[3] = { dst.cols, dst.rows, 1 };
     int step_in_pixel = dst.step / dst.elemSize(), offset_in_pixel = dst.offset / dst.elemSize();
-    char compile_option[32];
-    union sc
-    {
-        cl_uchar4 uval;
-        cl_char4  cval;
-        cl_ushort4 usval;
-        cl_short4 shval;
-        cl_int4 ival;
-        cl_float4 fval;
-        cl_double4 dval;
-    } val;
-    switch(dst.depth())
-    {
-    case CV_8U:
-        val.uval.s[0] = saturate_cast<uchar>(scalar.val[0]);
-        val.uval.s[1] = saturate_cast<uchar>(scalar.val[1]);
-        val.uval.s[2] = saturate_cast<uchar>(scalar.val[2]);
-        val.uval.s[3] = saturate_cast<uchar>(scalar.val[3]);
-        switch(dst.oclchannels())
-        {
-        case 1:
-            sprintf(compile_option, "-D GENTYPE=uchar");
-            args.push_back( std::make_pair( sizeof(cl_uchar) , (void *)&val.uval.s[0] ));
-            break;
-        case 4:
-            sprintf(compile_option, "-D GENTYPE=uchar4");
-            args.push_back( std::make_pair( sizeof(cl_uchar4) , (void *)&val.uval ));
-            break;
-        default:
-            CV_Error(Error::StsUnsupportedFormat, "unsupported channels");
-        }
-        break;
-    case CV_8S:
-        val.cval.s[0] = saturate_cast<char>(scalar.val[0]);
-        val.cval.s[1] = saturate_cast<char>(scalar.val[1]);
-        val.cval.s[2] = saturate_cast<char>(scalar.val[2]);
-        val.cval.s[3] = saturate_cast<char>(scalar.val[3]);
-        switch(dst.oclchannels())
-        {
-        case 1:
-            sprintf(compile_option, "-D GENTYPE=char");
-            args.push_back( std::make_pair( sizeof(cl_char) , (void *)&val.cval.s[0] ));
-            break;
-        case 4:
-            sprintf(compile_option, "-D GENTYPE=char4");
-            args.push_back( std::make_pair( sizeof(cl_char4) , (void *)&val.cval ));
-            break;
-        default:
-            CV_Error(Error::StsUnsupportedFormat, "unsupported channels");
-        }
-        break;
-    case CV_16U:
-        val.usval.s[0] = saturate_cast<ushort>(scalar.val[0]);
-        val.usval.s[1] = saturate_cast<ushort>(scalar.val[1]);
-        val.usval.s[2] = saturate_cast<ushort>(scalar.val[2]);
-        val.usval.s[3] = saturate_cast<ushort>(scalar.val[3]);
-        switch(dst.oclchannels())
-        {
-        case 1:
-            sprintf(compile_option, "-D GENTYPE=ushort");
-            args.push_back( std::make_pair( sizeof(cl_ushort) , (void *)&val.usval.s[0] ));
-            break;
-        case 4:
-            sprintf(compile_option, "-D GENTYPE=ushort4");
-            args.push_back( std::make_pair( sizeof(cl_ushort4) , (void *)&val.usval ));
-            break;
-        default:
-            CV_Error(Error::StsUnsupportedFormat, "unsupported channels");
-        }
-        break;
-    case CV_16S:
-        val.shval.s[0] = saturate_cast<short>(scalar.val[0]);
-        val.shval.s[1] = saturate_cast<short>(scalar.val[1]);
-        val.shval.s[2] = saturate_cast<short>(scalar.val[2]);
-        val.shval.s[3] = saturate_cast<short>(scalar.val[3]);
-        switch(dst.oclchannels())
-        {
-        case 1:
-            sprintf(compile_option, "-D GENTYPE=short");
-            args.push_back( std::make_pair( sizeof(cl_short) , (void *)&val.shval.s[0] ));
-            break;
-        case 4:
-            sprintf(compile_option, "-D GENTYPE=short4");
-            args.push_back( std::make_pair( sizeof(cl_short4) , (void *)&val.shval ));
-            break;
-        default:
-            CV_Error(Error::StsUnsupportedFormat, "unsupported channels");
-        }
-        break;
-    case CV_32S:
-        val.ival.s[0] = saturate_cast<int>(scalar.val[0]);
-        val.ival.s[1] = saturate_cast<int>(scalar.val[1]);
-        val.ival.s[2] = saturate_cast<int>(scalar.val[2]);
-        val.ival.s[3] = saturate_cast<int>(scalar.val[3]);
-        switch(dst.oclchannels())
-        {
-        case 1:
-            sprintf(compile_option, "-D GENTYPE=int");
-            args.push_back( std::make_pair( sizeof(cl_int) , (void *)&val.ival.s[0] ));
-            break;
-        case 4:
-            sprintf(compile_option, "-D GENTYPE=int4");
-            args.push_back( std::make_pair( sizeof(cl_int4) , (void *)&val.ival ));
-            break;
-        default:
-            CV_Error(Error::StsUnsupportedFormat, "unsupported channels");
-        }
-        break;
-    case CV_32F:
-        val.fval.s[0] = scalar.val[0];
-        val.fval.s[1] = scalar.val[1];
-        val.fval.s[2] = scalar.val[2];
-        val.fval.s[3] = scalar.val[3];
-        switch(dst.oclchannels())
-        {
-        case 1:
-            sprintf(compile_option, "-D GENTYPE=float");
-            args.push_back( std::make_pair( sizeof(cl_float) , (void *)&val.fval.s[0] ));
-            break;
-        case 4:
-            sprintf(compile_option, "-D GENTYPE=float4");
-            args.push_back( std::make_pair( sizeof(cl_float4) , (void *)&val.fval ));
-            break;
-        default:
-            CV_Error(Error::StsUnsupportedFormat, "unsupported channels");
-        }
-        break;
-    case CV_64F:
-        val.dval.s[0] = scalar.val[0];
-        val.dval.s[1] = scalar.val[1];
-        val.dval.s[2] = scalar.val[2];
-        val.dval.s[3] = scalar.val[3];
-        switch(dst.oclchannels())
-        {
-        case 1:
-            sprintf(compile_option, "-D GENTYPE=double");
-            args.push_back( std::make_pair( sizeof(cl_double) , (void *)&val.dval.s[0] ));
-            break;
-        case 4:
-            sprintf(compile_option, "-D GENTYPE=double4");
-            args.push_back( std::make_pair( sizeof(cl_double4) , (void *)&val.dval ));
-            break;
-        default:
-            CV_Error(Error::StsUnsupportedFormat, "unsupported channels");
-        }
-        break;
-    default:
-        CV_Error(Error::StsUnsupportedFormat, "unknown depth");
-    }
+
+    const char * const typeMap[] = { "uchar", "char", "ushort", "short", "int", "float", "double" };
+    const char channelMap[] = { ' ', ' ', '2', '4', '4' };
+    std::string buildOptions = format("-D GENTYPE=%s%c", typeMap[dst.depth()], channelMap[dst.channels()]);
+
+    oclMat m(Mat(1, 1, dst.type(), scalar));
+    args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&m.data ));
     args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst.data ));
     args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.cols ));
     args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.rows ));
@@ -799,38 +510,21 @@ static void set_to_withmask_run(const oclMat &dst, const Scalar &scalar, const o
     args.push_back( std::make_pair( sizeof(cl_int) , (void *)&mask.step ));
     args.push_back( std::make_pair( sizeof(cl_int) , (void *)&mask.offset ));
     openCLExecuteKernel(dst.clCxt , &operator_setToM, kernelName, globalThreads,
-                        localThreads, args, -1, -1, compile_option);
+                        localThreads, args, -1, -1, buildOptions.c_str());
 }
 
 oclMat &cv::ocl::oclMat::setTo(const Scalar &scalar, const oclMat &mask)
 {
-    //cout << "cv::ocl::oclMat::setTo()" << endl;
     CV_Assert(mask.type() == CV_8UC1);
     CV_Assert( this->depth() >= 0 && this->depth() <= 6 );
     CV_DbgAssert( !this->empty());
-    //cl_int status;
-    //cl_mem mem;
-    //mem = clCreateBuffer(this->clCxt->clContext,CL_MEM_READ_WRITE,
-    //                   sizeof(double)*4,NULL,&status);
-    //openCLVerifyCall(status);
-    //double* s =  (double *)scalar.val;
-    //openCLSafeCall(clEnqueueWriteBuffer(this->clCxt->clCmdQueue,
-    //                   (cl_mem)mem,1,0,sizeof(double)*4,s,0,0,0));
     if (mask.empty())
     {
-        if(type() == CV_8UC1)
-        {
-            set_to_withoutmask_run(*this, scalar, "set_to_without_mask_C1_D0");
-        }
-        else
-        {
-            set_to_withoutmask_run(*this, scalar, "set_to_without_mask");
-        }
+        set_to_withoutmask_run(*this, scalar, type() == CV_8UC1 ?
+                                   "set_to_without_mask_C1_D0" : "set_to_without_mask");  // CV_8UC1 selects its own kernel name
     }
     else
-    {
         set_to_withmask_run(*this, scalar, mask, "set_to_with_mask");
-    }
 
     return *this;
 }
@@ -845,79 +539,38 @@ oclMat cv::ocl::oclMat::reshape(int new_cn, int new_rows) const
     oclMat hdr = *this;
 
     int cn = oclchannels();
-
     if (new_cn == 0)
-
         new_cn = cn;
 
-
-
     int total_width = cols * cn;
-
-
-
     if ((new_cn > total_width || total_width % new_cn != 0) && new_rows == 0)
-
         new_rows = rows * total_width / new_cn;
 
-
-
     if (new_rows != 0 && new_rows != rows)
-
     {
-
         int total_size = total_width * rows;
 
-
-
         if (!isContinuous())
-
             CV_Error(Error::BadStep, "The matrix is not continuous, thus its number of rows can not be changed");
 
-
-
         if ((unsigned)new_rows > (unsigned)total_size)
-
             CV_Error(Error::StsOutOfRange, "Bad new number of rows");
 
-
-
         total_width = total_size / new_rows;
-
-
-
         if (total_width * new_rows != total_size)
-
             CV_Error(Error::StsBadArg, "The total number of matrix elements is not divisible by the new number of rows");
 
-
-
         hdr.rows = new_rows;
-
         hdr.step = total_width * elemSize1();
-
     }
 
-
-
     int new_width = total_width / new_cn;
-
-
-
     if (new_width * new_cn != total_width)
-
         CV_Error(Error::BadNumChannels, "The total width is not divisible by the new number of channels");
 
-
-
     hdr.cols = new_width;
-
     hdr.wholecols = new_width;
-
     hdr.flags = (hdr.flags & ~CV_MAT_CN_MASK) | ((new_cn - 1) << CV_CN_SHIFT);
-
-
-
     return hdr;
 
 }
@@ -973,7 +626,6 @@ void cv::ocl::oclMat::createEx(int _rows, int _cols, int _type,
 
 void cv::ocl::oclMat::release()
 {
-    //cout << "cv::ocl::oclMat::release()" << endl;
     if( refcount && CV_XADD(refcount, -1) == 1 )
     {
         fastFree(refcount);
diff --git a/modules/ocl/src/mcwutil.cpp b/modules/ocl/src/mcwutil.cpp
index c520cb8d4..2857f85e6 100644
--- a/modules/ocl/src/mcwutil.cpp
+++ b/modules/ocl/src/mcwutil.cpp
@@ -71,12 +71,6 @@ namespace cv
 {
     namespace ocl
     {
-
-        inline int divUp(int total, int grain)
-        {
-            return (total + grain - 1) / grain;
-        }
-
         // provide additional methods for the user to interact with the command queue after a task is fired
         static void openCLExecuteKernel_2(Context *clCxt , const char **source, String kernelName, size_t globalThreads[3],
                                    size_t localThreads[3],  std::vector< std::pair<size_t, const void *> > &args, int channels,
diff --git a/modules/ocl/src/opencl/arithm_2_mat.cl b/modules/ocl/src/opencl/arithm_2_mat.cl
deleted file mode 100644
index 63c1ccac0..000000000
--- a/modules/ocl/src/opencl/arithm_2_mat.cl
+++ /dev/null
@@ -1,158 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Shengen Yan,yanshengen@gmail.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other oclMaterials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-/**************************************PUBLICFUNC*************************************/
-#if defined (DOUBLE_SUPPORT)
-#pragma OPENCL EXTENSION cl_khr_fp64:enable
-#endif
-
-#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics:enable
-#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics:enable
-#define CV_PI   3.1415926535897932384626433832795
-
-char round_char(double v){
-    char v1=(char)v;
-    return convert_char_sat(v+(v>=0 ? 0.5 : -0.5));
-}
-unsigned char round_uchar(double v){
-    unsigned char v1=(unsigned char)v;
-    return convert_uchar_sat(v+(v>=0 ? 0.5 : -0.5));
-}
-short round_short(double v){
-    short v1=(short)v;
-    return convert_short_sat(v+(v>=0 ? 0.5 : -0.5));
-}
-unsigned short round_ushort(double v){
-    unsigned short v1=(unsigned short)v;
-    return convert_ushort_sat(v+(v>=0 ? 0.5 : -0.5));
-}
-int round_int(double v){
-    int v1=(int)v;
-    return convert_int_sat(v+(v>=0 ? 0.5 : -0.5));
-}
-
-char round2_char(double v){
-    char v1=(char)v;
-    if((v-v1)==0.5&&v1%2==0)
-        return v1;
-    else
-        return convert_char_sat(v+(v>=0 ? 0.5 : -0.5));
-}
-unsigned char round2_uchar(double v){
-    unsigned char v1=(unsigned char)v;
-    if((v-v1)==0.5&&v1%2==0)
-        return v1;
-    else
-        return convert_uchar_sat(v+(v>=0 ? 0.5 : -0.5));
-}
-short round2_short(double v){
-    short v1=(short)v;
-    if((v-v1)==0.5&&v1%2==0)
-        return v1;
-    else
-        return convert_short_sat(v+(v>=0 ? 0.5 : -0.5));
-}
-unsigned short round2_ushort(double v){
-    unsigned short v1=(unsigned short)v;
-    if((v-v1)==0.5&&v1%2==0)
-        return v1;
-    else
-        return convert_ushort_sat(v+(v>=0 ? 0.5 : -0.5));
-}
-int round2_int(double v){
-    int v1=(int)v;
-    if((v-v1)==0.5&&v1%2==0)
-        return v1;
-    else
-        return convert_int_sat(v+(v>=0 ? 0.5 : -0.5));
-}
-
-/*****************************************EXP***************************************/
-__kernel void arithm_op_exp_5 (int rows,int cols,int srcStep,__global float *src1Mat,
-                             __global float * dstMat,int channels)
-{
-    size_t x = get_global_id(0);
-    size_t y = get_global_id(1);
-    if (x < cols && y < rows)
-    {
-        size_t idx = y * ( srcStep >> 2 ) + x;
-        dstMat[idx] = (float)exp((float)src1Mat[idx]);
-    }
-}
-__kernel void arithm_op_exp_6 (int rows,int cols,int srcStep,__global double *src1Mat,
-                             __global double * dstMat,int channels)
-{
-    size_t x = get_global_id(0);
-    size_t y = get_global_id(1);
-    if (x < cols && y < rows)
-    {
-        size_t idx = y * ( srcStep >> 3 ) + x;
-        dstMat[idx] = exp(src1Mat[idx]);
-    }
-}
-
-/*****************************************LOG***************************************/
-__kernel void arithm_op_log_5 (int rows,int cols,int srcStep,__global float *src1Mat,
-                             __global float * dstMat,int channels)
-{
-    size_t x = get_global_id(0);
-    size_t y = get_global_id(1);
-    if (x < cols && y < rows)
-    {
-        size_t idx = y * ( srcStep >> 2 ) + x;
-        dstMat[idx] =(float) log((float)src1Mat[idx]);
-    }
-}
-__kernel void arithm_op_log_6 (int rows,int cols,int srcStep,__global double *src1Mat,
-                             __global double * dstMat,int channels)
-{
-    size_t x = get_global_id(0);
-    size_t y = get_global_id(1);
-    if (x < cols && y < rows)
-    {
-        size_t idx = y * ( srcStep >> 3 ) + x;
-        dstMat[idx] = log(src1Mat[idx]);
-    }
-}
diff --git a/modules/ocl/src/opencl/arithm_LUT.cl b/modules/ocl/src/opencl/arithm_LUT.cl
index 624da0008..ff21e9a31 100644
--- a/modules/ocl/src/opencl/arithm_LUT.cl
+++ b/modules/ocl/src/opencl/arithm_LUT.cl
@@ -38,125 +38,66 @@
 #pragma OPENCL EXTENSION cl_khr_fp64:enable
 #endif
 
-__kernel
-void LUT_C1_D0( __global uchar *dst,
-      __global const uchar *src,
-      __constant uchar *table,
-      int rows,
-      int cols,
-      int channels,
-      int whole_rows,
-      int whole_cols,
-      int src_offset,
-      int dst_offset,
-      int lut_offset,
-      int src_step,
-      int dst_step)
+__kernel void LUT_C1( __global const srcT * src, __global const dstT *lut,
+      __global dstT *dst,
+      int cols1, int rows,
+      int src_offset1,
+      int lut_offset1,
+      int dst_offset1,
+      int src_step1, int dst_step1)
 {
-    int gidx = get_global_id(0)<<2;
-    int gidy = get_global_id(1);
-    int lidx = get_local_id(0);
-    int lidy = get_local_id(1);
+    int x1 = get_global_id(0);
+    int y = get_global_id(1);
 
-    __local uchar l[256];
-    l[(lidy<<4)+lidx] = table[(lidy<<4)+lidx+lut_offset];
-    //mem_fence(CLK_LOCAL_MEM_FENCE);
-
-
-    //clamp(gidx,mask,cols-1);
-    gidx = gidx >= cols-4?cols-4:gidx;
-    gidy = gidy >= rows?rows-1:gidy;
-
-    int src_index = src_offset + mad24(gidy,src_step,gidx);
-    int dst_index = dst_offset + mad24(gidy,dst_step,gidx);
-    uchar4 p,q;
-    barrier(CLK_LOCAL_MEM_FENCE);
-    p.x = src[src_index];
-    p.y = src[src_index+1];
-    p.z = src[src_index+2];
-    p.w = src[src_index+3];
-
-    q.x = l[p.x];
-    q.y = l[p.y];
-    q.z = l[p.z];
-    q.w = l[p.w];
-    *(__global uchar4*)(dst + dst_index) = q;
-}
-
-__kernel
-void LUT2_C1_D0( __global uchar *dst,
-      __global const uchar *src,
-      __constant uchar *table,
-      int rows,
-      int precols,
-      int channels,
-      int whole_rows,
-      int cols,
-      int src_offset,
-      int dst_offset,
-      int lut_offset,
-      int src_step,
-      int dst_step)
-{
-    int gidx = get_global_id(0);
-    int gidy = get_global_id(1);
-    //int lidx = get_local_id(0);
-    int lidy = get_local_id(1);
-
-    __local uchar l[256];
-    l[lidy] = table[lidy+lut_offset];
-    //mem_fence(CLK_LOCAL_MEM_FENCE);
-
-
-    //clamp(gidx,mask,cols-1);
-    gidx = gidx >= precols ? cols+gidx : gidx;
-    gidy = gidy >= rows?rows-1:gidy;
-
-    int src_index = src_offset + mad24(gidy,src_step,gidx);
-    int dst_index = dst_offset + mad24(gidy,dst_step,gidx);
-    //uchar4 p,q;
-    barrier(CLK_LOCAL_MEM_FENCE);
-    uchar p = src[src_index];
-    uchar q = l[p];
-    dst[dst_index] = q;
-}
-
-__kernel
-void LUT_C4_D0( __global uchar4 *dst,
-      __global uchar4 *src,
-      __constant uchar *table,
-      int rows,
-      int cols,
-      int channels,
-      int whole_rows,
-      int whole_cols,
-      int src_offset,
-      int dst_offset,
-      int lut_offset,
-      int src_step,
-      int dst_step)
-{
-    int gidx = get_global_id(0);
-    int gidy = get_global_id(1);
-
-    int lidx = get_local_id(0);
-    int lidy = get_local_id(1);
-
-    int src_index = mad24(gidy,src_step,gidx+src_offset);
-    int dst_index = mad24(gidy,dst_step,gidx+dst_offset);
-    __local uchar l[256];
-    l[lidy*16+lidx] = table[lidy*16+lidx+lut_offset];
-    //mem_fence(CLK_LOCAL_MEM_FENCE);
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if(gidx<cols && gidy<rows)
+    if (x1 < cols1 && y < rows)
     {
-        uchar4 p = src[src_index];
-        uchar4 q;
-        q.x = l[p.x];
-        q.y = l[p.y];
-        q.z = l[p.z];
-        q.w = l[p.w];
-        dst[dst_index] = q;
+        int src_index = mad24(y, src_step1, src_offset1 + x1);
+        int dst_index = mad24(y, dst_step1, dst_offset1 + x1);
+
+        dst[dst_index] = lut[lut_offset1 + src[src_index]];
+    }
+}
+
+__kernel void LUT_C2( __global const srcT * src, __global const dstT *lut, // table lookup, two interleaved table entries per source value
+      __global dstT *dst,
+      int cols1, int rows,               // upper bounds for x1 and y (see the guard below)
+      int src_offset1,
+      int lut_offset1,
+      int dst_offset1,
+      int src_step1, int dst_step1)      // steps and offsets are applied as element indices into the typed pointers
+{
+    int x1 = get_global_id(0) << 1;      // first of the (up to) two consecutive elements handled by this work-item
+    int y = get_global_id(1);
+
+    if (x1 < cols1 && y < rows)
+    {
+        int src_index = mad24(y, src_step1, src_offset1 + x1);
+        int dst_index = mad24(y, dst_step1, dst_offset1 + x1);
+
+        dst[dst_index    ] = lut[lut_offset1 + (src[src_index    ] << 1)    ];                      // element k reads table slot (value << 1) + k
+        if (x1 + 1 < cols1) dst[dst_index + 1] = lut[lut_offset1 + (src[src_index + 1] << 1) + 1]; // guarded store: never read or write dst at or past cols1
+    }
+}
+
+__kernel void LUT_C4( __global const srcT * src, __global const dstT *lut, // table lookup, four interleaved table entries per source value
+      __global dstT *dst,
+      int cols1, int rows,               // upper bounds for x1 and y (see the guard below)
+      int src_offset1,
+      int lut_offset1,
+      int dst_offset1,
+      int src_step1, int dst_step1)      // steps and offsets are applied as element indices into the typed pointers
+{
+    int x1 = get_global_id(0) << 2;      // first of the (up to) four consecutive elements handled by this work-item
+    int y = get_global_id(1);
+
+    if (x1 < cols1 && y < rows)
+    {
+        int src_index = mad24(y, src_step1, src_offset1 + x1);
+        int dst_index = mad24(y, dst_step1, dst_offset1 + x1);
+
+        dst[dst_index    ] = lut[lut_offset1 + (src[src_index    ] << 2)    ];                      // element k reads table slot (value << 2) + k
+        if (x1 + 1 < cols1) dst[dst_index + 1] = lut[lut_offset1 + (src[src_index + 1] << 2) + 1]; // guarded stores: never read or write dst at or past cols1
+        if (x1 + 2 < cols1) dst[dst_index + 2] = lut[lut_offset1 + (src[src_index + 2] << 2) + 2];
+        if (x1 + 3 < cols1) dst[dst_index + 3] = lut[lut_offset1 + (src[src_index + 3] << 2) + 3];
     }
 }
diff --git a/modules/ocl/src/opencl/arithm_absdiff.cl b/modules/ocl/src/opencl/arithm_absdiff.cl
deleted file mode 100644
index 341a0048f..000000000
--- a/modules/ocl/src/opencl/arithm_absdiff.cl
+++ /dev/null
@@ -1,970 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Jia Haipeng, jiahaipeng95@gmail.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other oclMaterials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#if defined (DOUBLE_SUPPORT)
-#ifdef cl_khr_fp64
-#pragma OPENCL EXTENSION cl_khr_fp64:enable
-#elif defined (cl_amd_fp64)
-#pragma OPENCL EXTENSION cl_amd_fp64:enable
-#endif
-#endif
-
-//////////////////////////////////////////////////////////////////////////////////////////////////////
-/////////////////////////////////////////////absdiff////////////////////////////////////////////////////
-///////////////////////////////////////////////////////////////////////////////////////////////////////
-/**************************************adddiff *************************************/
-__kernel void arithm_absdiff_D0 (__global uchar *src1, int src1_step, int src1_offset,
-                                 __global uchar *src2, int src2_step, int src2_offset,
-                                 __global uchar *dst,  int dst_step,  int dst_offset,
-                                 int rows, int cols, int dst_step1)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 2;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-        #define dst_align (dst_offset & 3)
-        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
-        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
-        uchar4 src1_data = vload4(0, src1 + src1_index_fix);
-        uchar4 src2_data = vload4(0, src2 + src2_index_fix);
-        if(src1_index < 0)
-        {
-            uchar4 tmp;
-            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-        }
-        if(src2_index < 0)
-        {
-            uchar4 tmp;
-            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-        }
-
-        uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
-        uchar4 tmp_data = abs_diff(src1_data, src2_data);
-
-        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
-        dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
-        dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
-        dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;
-
-        *((__global uchar4 *)(dst + dst_index)) = dst_data;
-    }
-}
-__kernel void arithm_absdiff_D2 (__global ushort *src1, int src1_step, int src1_offset,
-                                 __global ushort *src2, int src2_step, int src2_offset,
-                                 __global ushort *dst,  int dst_step,  int dst_offset,
-                                 int rows, int cols, int dst_step1)
-
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 2;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align ((dst_offset >> 1) & 3)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
-        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8);
-
-        ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index));
-        ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));
-
-        ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index));
-        ushort4 tmp_data = abs_diff(src1_data, src2_data);
-
-        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
-        dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y;
-        dst_data.z = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.z : dst_data.z;
-        dst_data.w = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) ? tmp_data.w : dst_data.w;
-
-        *((__global ushort4 *)((__global char *)dst + dst_index)) = dst_data;
-    }
-}
-__kernel void arithm_absdiff_D3 (__global short *src1, int src1_step, int src1_offset,
-                                 __global short *src2, int src2_step, int src2_offset,
-                                 __global short *dst,  int dst_step,  int dst_offset,
-                                 int rows, int cols, int dst_step1)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 2;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align ((dst_offset >> 1) & 3)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
-        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8);
-
-        short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index));
-        short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));
-
-        short4  dst_data = *((__global short4 *)((__global char *)dst + dst_index));
-        ushort4 tmp = abs_diff(src1_data, src2_data);
-        short4  tmp_data = convert_short4_sat(tmp);
-
-        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
-        dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y;
-        dst_data.z = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.z : dst_data.z;
-        dst_data.w = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) ? tmp_data.w : dst_data.w;
-
-        *((__global short4 *)((__global char *)dst + dst_index)) = dst_data;
-    }
-}
-
-__kernel void arithm_absdiff_D4 (__global int *src1, int src1_step, int src1_offset,
-                                 __global int *src2, int src2_step, int src2_offset,
-                                 __global int *dst,  int dst_step,  int dst_offset,
-                                 int rows, int cols, int dst_step1)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
-        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);
-
-        int data1 = *((__global int *)((__global char *)src1 + src1_index));
-        int data2 = *((__global int *)((__global char *)src2 + src2_index));
-        uint tmp = abs_diff(data1, data2);
-        int  tmp_data = convert_int_sat(tmp);
-
-        *((__global int *)((__global char *)dst + dst_index)) = tmp_data;
-    }
-}
-__kernel void arithm_absdiff_D5 (__global float *src1, int src1_step, int src1_offset,
-                                 __global float *src2, int src2_step, int src2_offset,
-                                 __global float *dst,  int dst_step,  int dst_offset,
-                                 int rows, int cols, int dst_step1)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
-        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);
-
-        float data1 = *((__global float *)((__global char *)src1 + src1_index));
-        float data2 = *((__global float *)((__global char *)src2 + src2_index));
-        float tmp = fabs(data1 - data2);
-
-        *((__global float *)((__global char *)dst + dst_index)) = tmp;
-    }
-}
-
-#if defined (DOUBLE_SUPPORT)
-__kernel void arithm_absdiff_D6 (__global double *src1, int src1_step, int src1_offset,
-                                 __global double *src2, int src2_step, int src2_offset,
-                                 __global double *dst,  int dst_step,  int dst_offset,
-                                 int rows, int cols, int dst_step1)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
-        int src2_index = mad24(y, src2_step, (x << 3) + src2_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 3) + dst_offset);
-
-        double data1 = *((__global double *)((__global char *)src1 + src1_index));
-        double data2 = *((__global double *)((__global char *)src2 + src2_index));
-        double tmp = fabs(data1-data2);
-
-        *((__global double *)((__global char *)dst + dst_index)) = tmp;
-    }
-}
-#endif
-
-/**************************************absdiff with scalar**************************************/
-__kernel void arithm_s_absdiff_C1_D0 (__global   uchar *src1, int src1_step, int src1_offset,
-                                      __global   uchar *dst,  int dst_step,  int dst_offset,
-                                      int4 src2, int rows, int cols, int dst_step1)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 2;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align (dst_offset & 3)
-        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-        uchar4 src1_data = vload4(0, src1 + src1_index_fix);
-        int4 src2_data = (int4)(src2.x, src2.x, src2.x, src2.x);
-        if(src1_index < 0)
-        {
-            uchar4 tmp;
-            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-        }
-
-        uchar4 data = *((__global uchar4 *)(dst + dst_index));
-        uchar4 tmp_data = convert_uchar4_sat(abs_diff(convert_int4_sat(src1_data), src2_data));
-
-        data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x;
-        data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : data.y;
-        data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : data.z;
-        data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : data.w;
-
-        *((__global uchar4 *)(dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_s_absdiff_C1_D2 (__global   ushort *src1, int src1_step, int src1_offset,
-                                      __global   ushort *dst,  int dst_step,  int dst_offset,
-                                      int4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 1;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align ((dst_offset >> 1) & 1)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
-
-        ushort2 src1_data = vload2(0, (__global ushort *)((__global char *)src1 + src1_index));
-        int2 src2_data = (int2)(src2.x, src2.x);
-
-        ushort2 data = *((__global ushort2 *)((__global uchar *)dst + dst_index));
-        ushort2 tmp_data = convert_ushort2_sat(abs_diff(convert_int2_sat(src1_data), src2_data));
-
-        data.x = (dst_index + 0 >= dst_start) ? tmp_data.x : data.x;
-        data.y = (dst_index + 2 <  dst_end  ) ? tmp_data.y : data.y;
-
-        *((__global ushort2 *)((__global uchar *)dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_s_absdiff_C1_D3 (__global   short *src1, int src1_step, int src1_offset,
-                                      __global   short *dst,  int dst_step,  int dst_offset,
-                                      int4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 1;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align ((dst_offset >> 1) & 1)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
-
-        short2 src1_data = vload2(0, (__global short *)((__global char *)src1 + src1_index));
-        int2 src2_data = (int2)(src2.x, src2.x);
-        short2 data = *((__global short2 *)((__global uchar *)dst + dst_index));
-
-        ushort2 tmp = convert_ushort2_sat(abs_diff(convert_int2_sat(src1_data), src2_data));
-        short2 tmp_data = convert_short2_sat(tmp);
-
-        data.x = (dst_index + 0 >= dst_start) ? tmp_data.x : data.x;
-        data.y = (dst_index + 2 <  dst_end  ) ? tmp_data.y : data.y;
-
-        *((__global short2 *)((__global uchar *)dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_s_absdiff_C1_D4 (__global   int *src1, int src1_step, int src1_offset,
-                                      __global   int *dst,  int dst_step,  int dst_offset,
-                                      int4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);
-
-        int src_data1 = *((__global int *)((__global char *)src1 + src1_index));
-        int src_data2 = src2.x;
-        int dst_data  = *((__global int *)((__global char *)dst  + dst_index));
-
-        uint tmp_data = abs_diff(src_data1, src_data2);
-        int  data = convert_int_sat(tmp_data);
-
-        *((__global int *)((__global char *)dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_s_absdiff_C1_D5 (__global   float *src1, int src1_step, int src1_offset,
-                                      __global   float *dst,  int dst_step,  int dst_offset,
-                                      float4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);
-
-        float src_data1 = *((__global float *)((__global char *)src1 + src1_index));
-        float src_data2 = src2.x;
-        float dst_data  = *((__global float *)((__global char *)dst  + dst_index));
-
-        float data = fabs(src_data1 - src_data2);
-
-        *((__global float *)((__global char *)dst + dst_index)) = data;
-    }
-}
-
-#if defined (DOUBLE_SUPPORT)
-__kernel void arithm_s_absdiff_C1_D6 (__global   double *src1, int src1_step, int src1_offset,
-                                      __global   double *dst,  int dst_step,  int dst_offset,
-                                      double4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 3) + dst_offset);
-
-        double src_data1 = *((__global double *)((__global char *)src1 + src1_index));
-        double src2_data = src2.x;
-        double dst_data  = *((__global double *)((__global char *)dst  + dst_index));
-
-        double data = fabs(src_data1 - src2_data);
-
-        *((__global double *)((__global char *)dst + dst_index)) = data;
-    }
-}
-#endif
-
-__kernel void arithm_s_absdiff_C2_D0 (__global   uchar *src1, int src1_step, int src1_offset,
-                                      __global   uchar *dst,  int dst_step,  int dst_offset,
-                                      int4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 1;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align ((dst_offset >> 1) & 1)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
-
-        uchar4 src1_data = vload4(0, src1 + src1_index);
-        int4 src2_data = (int4)(src2.x, src2.y, src2.x, src2.y);
-
-        uchar4 data = *((__global uchar4 *)(dst + dst_index));
-        uchar4 tmp_data = convert_uchar4_sat(abs_diff(convert_int4_sat(src1_data), src2_data));
-
-        data.xy = (dst_index + 0 >= dst_start) ? tmp_data.xy : data.xy;
-        data.zw = (dst_index + 2 <  dst_end  ) ? tmp_data.zw : data.zw;
-
-        *((__global uchar4 *)(dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_s_absdiff_C2_D2 (__global   ushort *src1, int src1_step, int src1_offset,
-                                      __global   ushort *dst,  int dst_step,  int dst_offset,
-                                      int4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);
-
-        ushort2 src_data1 = *((__global ushort2 *)((__global char *)src1 + src1_index));
-        int2 src_data2 = (int2)(src2.x, src2.y);
-        ushort2 dst_data  = *((__global ushort2 *)((__global char *)dst  + dst_index));
-
-        ushort2 data = convert_ushort2_sat( abs_diff(convert_int2_sat(src_data1), src_data2));
-
-        *((__global ushort2 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_s_absdiff_C2_D3 (__global   short *src1, int src1_step, int src1_offset,
-                                      __global   short *dst,  int dst_step,  int dst_offset,
-                                      int4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);
-
-        short2 src_data1 = *((__global short2 *)((__global char *)src1 + src1_index));
-        int2 src_data2 = (int2)(src2.x, src2.y);
-        short2 dst_data  = *((__global short2 *)((__global char *)dst  + dst_index));
-
-        ushort2 tmp = convert_ushort2_sat(abs_diff(convert_int2_sat(src_data1), src_data2));
-        short2 data = convert_short2_sat(tmp);
-
-        *((__global short2 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_s_absdiff_C2_D4 (__global   int *src1, int src1_step, int src1_offset,
-                                      __global   int *dst,  int dst_step,  int dst_offset,
-                                      int4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 3) + dst_offset);
-
-        int2 src_data1 = *((__global int2 *)((__global char *)src1 + src1_index));
-        int2 src_data2 = (int2)(src2.x, src2.y);
-        int2 dst_data  = *((__global int2 *)((__global char *)dst  + dst_index));
-
-        int2 data = convert_int2_sat(abs_diff(src_data1, src_data2));
-        *((__global int2 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_s_absdiff_C2_D5 (__global   float *src1, int src1_step, int src1_offset,
-                                      __global   float *dst,  int dst_step,  int dst_offset,
-                                      float4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 3) + dst_offset);
-
-        float2 src_data1 = *((__global float2 *)((__global char *)src1 + src1_index));
-        float2 src_data2 = (float2)(src2.x, src2.y);
-        float2 dst_data  = *((__global float2 *)((__global char *)dst  + dst_index));
-
-        float2 data = fabs(src_data1 - src_data2);
-        *((__global float2 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-#if defined (DOUBLE_SUPPORT)
-__kernel void arithm_s_absdiff_C2_D6 (__global   double *src1, int src1_step, int src1_offset,
-                                      __global   double *dst,  int dst_step,  int dst_offset,
-                                      double4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 4) + dst_offset);
-
-        double2 src_data1 = *((__global double2 *)((__global char *)src1 + src1_index));
-        double2 src_data2 = (double2)(src2.x, src2.y);
-        double2 dst_data  = *((__global double2 *)((__global char *)dst  + dst_index));
-
-        double2 data = fabs(src_data1 - src_data2);
-
-        *((__global double2 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-#endif
-__kernel void arithm_s_absdiff_C3_D0 (__global   uchar *src1, int src1_step, int src1_offset,
-                                      __global   uchar *dst,  int dst_step,  int dst_offset,
-                                      int4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 2;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
-        int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3));
-
-        uchar4 src1_data_0 = vload4(0, src1 + src1_index + 0);
-        uchar4 src1_data_1 = vload4(0, src1 + src1_index + 4);
-        uchar4 src1_data_2 = vload4(0, src1 + src1_index + 8);
-
-        int4 src2_data_0 = (int4)(src2.x, src2.y, src2.z, src2.x);
-        int4 src2_data_1 = (int4)(src2.y, src2.z, src2.x, src2.y);
-        int4 src2_data_2 = (int4)(src2.z, src2.x, src2.y, src2.z);
-
-        uchar4 data_0 = *((__global uchar4 *)(dst + dst_index + 0));
-        uchar4 data_1 = *((__global uchar4 *)(dst + dst_index + 4));
-        uchar4 data_2 = *((__global uchar4 *)(dst + dst_index + 8));
-
-        uchar4 tmp_data_0 = convert_uchar4_sat(abs_diff(convert_int4_sat(src1_data_0), src2_data_0));
-        uchar4 tmp_data_1 = convert_uchar4_sat(abs_diff(convert_int4_sat(src1_data_1), src2_data_1));
-        uchar4 tmp_data_2 = convert_uchar4_sat(abs_diff(convert_int4_sat(src1_data_2), src2_data_2));
-
-        data_0.xyz = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
-        data_0.w   = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
-                     ? tmp_data_0.w : data_0.w;
-
-        data_1.xy  = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
-                     ? tmp_data_1.xy : data_1.xy;
-        data_1.zw  = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                     ? tmp_data_1.zw : data_1.zw;
-
-        data_2.x   = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                     ? tmp_data_2.x : data_2.x;
-        data_2.yzw = ((dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
-                     ? tmp_data_2.yzw : data_2.yzw;
-
-        *((__global uchar4 *)(dst + dst_index + 0)) = data_0;
-        *((__global uchar4 *)(dst + dst_index + 4)) = data_1;
-        *((__global uchar4 *)(dst + dst_index + 8)) = data_2;
-    }
-}
-__kernel void arithm_s_absdiff_C3_D2 (__global   ushort *src1, int src1_step, int src1_offset,
-                                      __global   ushort *dst,  int dst_step,  int dst_offset,
-                                      int4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 1;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
-        int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6));
-
-        ushort2 src1_data_0 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 0));
-        ushort2 src1_data_1 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 4));
-        ushort2 src1_data_2 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 8));
-
-        int2 src2_data_0 = (int2)(src2.x, src2.y);
-        int2 src2_data_1 = (int2)(src2.z, src2.x);
-        int2 src2_data_2 = (int2)(src2.y, src2.z);
-
-        ushort2 data_0 = *((__global ushort2 *)((__global char *)dst + dst_index + 0));
-        ushort2 data_1 = *((__global ushort2 *)((__global char *)dst + dst_index + 4));
-        ushort2 data_2 = *((__global ushort2 *)((__global char *)dst + dst_index + 8));
-
-        ushort2 tmp_data_0 = convert_ushort2_sat(abs_diff(convert_int2_sat(src1_data_0), src2_data_0));
-        ushort2 tmp_data_1 = convert_ushort2_sat(abs_diff(convert_int2_sat(src1_data_1), src2_data_1));
-        ushort2 tmp_data_2 = convert_ushort2_sat(abs_diff(convert_int2_sat(src1_data_2), src2_data_2));
-
-        data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
-
-        data_1.x  = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
-                    ? tmp_data_1.x : data_1.x;
-        data_1.y  = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                    ? tmp_data_1.y : data_1.y;
-
-        data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                    ? tmp_data_2.xy : data_2.xy;
-
-        *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
-        *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
-        *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
-    }
-}
-__kernel void arithm_s_absdiff_C3_D3 (__global   short *src1, int src1_step, int src1_offset,
-                                      __global   short *dst,  int dst_step,  int dst_offset,
-                                      int4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 1;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
-        int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6));
-
-        short2 src1_data_0 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 0));
-        short2 src1_data_1 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 4));
-        short2 src1_data_2 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 8));
-
-        int2 src2_data_0 = (int2)(src2.x, src2.y);
-        int2 src2_data_1 = (int2)(src2.z, src2.x);
-        int2 src2_data_2 = (int2)(src2.y, src2.z);
-
-        short2 data_0 = *((__global short2 *)((__global char *)dst + dst_index + 0));
-        short2 data_1 = *((__global short2 *)((__global char *)dst + dst_index + 4));
-        short2 data_2 = *((__global short2 *)((__global char *)dst + dst_index + 8));
-
-        short2 tmp_data_0 = convert_short2_sat(abs_diff(convert_int2_sat(src1_data_0), src2_data_0));
-        short2 tmp_data_1 = convert_short2_sat(abs_diff(convert_int2_sat(src1_data_1), src2_data_1));
-        short2 tmp_data_2 = convert_short2_sat(abs_diff(convert_int2_sat(src1_data_2), src2_data_2));
-
-        data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
-
-        data_1.x  = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
-                    ? tmp_data_1.x : data_1.x;
-        data_1.y  = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                    ? tmp_data_1.y : data_1.y;
-
-        data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                    ? tmp_data_2.xy : data_2.xy;
-
-        *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
-        *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
-        *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
-    }
-}
-__kernel void arithm_s_absdiff_C3_D4 (__global   int *src1, int src1_step, int src1_offset,
-                                      __global   int *dst,  int dst_step,  int dst_offset,
-                                      int4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
-        int dst_index  = mad24(y, dst_step, dst_offset + (x * 12));
-
-        int src1_data_0 = *((__global int *)((__global char *)src1 + src1_index + 0));
-        int src1_data_1 = *((__global int *)((__global char *)src1 + src1_index + 4));
-        int src1_data_2 = *((__global int *)((__global char *)src1 + src1_index + 8));
-
-        int src2_data_0 = src2.x;
-        int src2_data_1 = src2.y;
-        int src2_data_2 = src2.z;
-
-        int data_0 = *((__global int *)((__global char *)dst + dst_index + 0));
-        int data_1 = *((__global int *)((__global char *)dst + dst_index + 4));
-        int data_2 = *((__global int *)((__global char *)dst + dst_index + 8));
-
-        int tmp_data_0 = convert_int_sat(abs_diff(src1_data_0, src2_data_0));
-        int tmp_data_1 = convert_int_sat(abs_diff(src1_data_1, src2_data_1));
-        int tmp_data_2 = convert_int_sat(abs_diff(src1_data_2, src2_data_2));
-
-        *((__global int *)((__global char *)dst + dst_index + 0))= tmp_data_0;
-        *((__global int *)((__global char *)dst + dst_index + 4))= tmp_data_1;
-        *((__global int *)((__global char *)dst + dst_index + 8))= tmp_data_2;
-    }
-}
-__kernel void arithm_s_absdiff_C3_D5 (__global   float *src1, int src1_step, int src1_offset,
-                                      __global   float *dst,  int dst_step,  int dst_offset,
-                                      float4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
-        int dst_index  = mad24(y, dst_step, dst_offset + (x * 12));
-
-        float src1_data_0 = *((__global float *)((__global char *)src1 + src1_index + 0));
-        float src1_data_1 = *((__global float *)((__global char *)src1 + src1_index + 4));
-        float src1_data_2 = *((__global float *)((__global char *)src1 + src1_index + 8));
-
-        float src2_data_0 = src2.x;
-        float src2_data_1 = src2.y;
-        float src2_data_2 = src2.z;
-
-        float data_0 = *((__global float *)((__global char *)dst + dst_index + 0));
-        float data_1 = *((__global float *)((__global char *)dst + dst_index + 4));
-        float data_2 = *((__global float *)((__global char *)dst + dst_index + 8));
-
-        float tmp_data_0 = fabs(src1_data_0 - src2_data_0);
-        float tmp_data_1 = fabs(src1_data_1 - src2_data_1);
-        float tmp_data_2 = fabs(src1_data_2 - src2_data_2);
-
-        *((__global float *)((__global char *)dst + dst_index + 0))= tmp_data_0;
-        *((__global float *)((__global char *)dst + dst_index + 4))= tmp_data_1;
-        *((__global float *)((__global char *)dst + dst_index + 8))= tmp_data_2;
-    }
-}
-
-#if defined (DOUBLE_SUPPORT)
-__kernel void arithm_s_absdiff_C3_D6 (__global   double *src1, int src1_step, int src1_offset,
-                                      __global   double *dst,  int dst_step,  int dst_offset,
-                                      double4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
-        int dst_index  = mad24(y, dst_step, dst_offset + (x * 24));
-
-        double src1_data_0 = *((__global double *)((__global char *)src1 + src1_index + 0 ));
-        double src1_data_1 = *((__global double *)((__global char *)src1 + src1_index + 8 ));
-        double src1_data_2 = *((__global double *)((__global char *)src1 + src1_index + 16));
-
-        double src2_data_0 = src2.x;
-        double src2_data_1 = src2.y;
-        double src2_data_2 = src2.z;
-
-        double data_0 = *((__global double *)((__global char *)dst + dst_index + 0 ));
-        double data_1 = *((__global double *)((__global char *)dst + dst_index + 8 ));
-        double data_2 = *((__global double *)((__global char *)dst + dst_index + 16));
-
-        double tmp_data_0 = fabs(src1_data_0 - src2_data_0);
-        double tmp_data_1 = fabs(src1_data_1 - src2_data_1);
-        double tmp_data_2 = fabs(src1_data_2 - src2_data_2);
-
-        *((__global double *)((__global char *)dst + dst_index + 0 ))= tmp_data_0;
-        *((__global double *)((__global char *)dst + dst_index + 8 ))= tmp_data_1;
-        *((__global double *)((__global char *)dst + dst_index + 16))= tmp_data_2;
-    }
-}
-#endif
-__kernel void arithm_s_absdiff_C4_D0 (__global   uchar *src1, int src1_step, int src1_offset,
-                                      __global   uchar *dst,  int dst_step,  int dst_offset,
-                                      int4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);
-
-        uchar4 src_data1 = *((__global uchar4 *)(src1 + src1_index));
-
-        uchar4 data = convert_uchar4_sat(abs_diff(convert_int4_sat(src_data1), src2));
-
-        *((__global uchar4 *)(dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_s_absdiff_C4_D2 (__global   ushort *src1, int src1_step, int src1_offset,
-                                      __global   ushort *dst,  int dst_step,  int dst_offset,
-                                      int4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 3) + dst_offset);
-
-        ushort4 src_data1 = *((__global ushort4 *)((__global char *)src1 + src1_index));
-
-        ushort4 data = convert_ushort4_sat(abs_diff(convert_int4_sat(src_data1), src2));
-
-        *((__global ushort4 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_s_absdiff_C4_D3 (__global   short *src1, int src1_step, int src1_offset,
-                                      __global   short *dst,  int dst_step,  int dst_offset,
-                                      int4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 3) + dst_offset);
-
-        short4 src_data1 = *((__global short4 *)((__global char *)src1 + src1_index));
-
-        short4 data = convert_short4_sat(abs_diff(convert_int4_sat(src_data1), src2));
-
-        *((__global short4 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_s_absdiff_C4_D4 (__global   int *src1, int src1_step, int src1_offset,
-                                      __global   int *dst,  int dst_step,  int dst_offset,
-                                      int4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 4) + dst_offset);
-
-        int4 src_data1 = *((__global int4 *)((__global char *)src1 + src1_index));
-
-        int4 data = convert_int4_sat(abs_diff(src_data1, src2));
-
-        *((__global int4 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_s_absdiff_C4_D5 (__global   float *src1, int src1_step, int src1_offset,
-                                      __global   float *dst,  int dst_step,  int dst_offset,
-                                      float4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 4) + dst_offset);
-
-        float4 src_data1 = *((__global float4 *)((__global char *)src1 + src1_index));
-
-        float4 data = fabs(src_data1 - src2);
-
-        *((__global float4 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-
-#if defined (DOUBLE_SUPPORT)
-__kernel void arithm_s_absdiff_C4_D6 (__global   double *src1, int src1_step, int src1_offset,
-                                      __global   double *dst,  int dst_step,  int dst_offset,
-                                      double4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 5) + src1_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 5) + dst_offset);
-
-        double4 src_data1 = *((__global double4 *)((__global char *)src1 + src1_index));
-
-        double4 data = fabs(src_data1 - src2);
-
-        *((__global double4 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-#endif
diff --git a/modules/ocl/src/opencl/arithm_add.cl b/modules/ocl/src/opencl/arithm_add.cl
index 070ced473..38834e766 100644
--- a/modules/ocl/src/opencl/arithm_add.cl
+++ b/modules/ocl/src/opencl/arithm_add.cl
@@ -52,809 +52,105 @@
 #endif
 #endif
 
-#ifdef ARITHM_ADD
-  #define ARITHM_OP(A,B) ((A)+(B))
-#elif defined ARITHM_SUB
-  #define ARITHM_OP(A,B) ((A)-(B))
-#endif
 //////////////////////////////////////////////////////////////////////////////////////////////////////
-/////////////////////////////////////////////ADD////////////////////////////////////////////////////
-///////////////////////////////////////////////////////////////////////////////////////////////////////
-/**************************************add without mask**************************************/
-__kernel void arithm_add_D0 (__global uchar *src1, int src1_step, int src1_offset,
-                             __global uchar *src2, int src2_step, int src2_offset,
-                             __global uchar *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1)
+///////////////////////////////////////////// ADD ////////////////////////////////////////////////////
+//////////////////////////////////////////////////////////////////////////////////////////////////////
+
+__kernel void arithm_binary_op_mat(__global T *src1, int src1_step, int src1_offset, // element-wise: dst = src1 Operation src2
+                         __global T *src2, int src2_step, int src2_offset,
+                         __global T *dst, int dst_step, int dst_offset, // indices built from *_step / *_offset are applied directly to the T pointers (element units, no char* cast)
+                         int cols, int rows) // T, WT, Operation, convertToT, convertToWT are not defined in this view - presumably build-time macros; TODO confirm
 {
     int x = get_global_id(0);
     int y = get_global_id(1);
 
     if (x < cols && y < rows)
     {
-        x = x << 2;
+        int src1_index = mad24(y, src1_step, x + src1_offset); // y * src1_step + x + src1_offset
+        int src2_index = mad24(y, src2_step, x + src2_offset); // y * src2_step + x + src2_offset
+        int dst_index  = mad24(y, dst_step, x + dst_offset); // y * dst_step + x + dst_offset
 
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align (dst_offset & 3)
-        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
-        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
-        uchar4 src1_data = vload4(0, src1 + src1_index_fix);
-        uchar4 src2_data = vload4(0, src2 + src2_index_fix);
-        if(src1_index < 0)
-        {
-            uchar4 tmp;
-            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-        }
-        if(src2_index < 0)
-        {
-            uchar4 tmp;
-            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-        }
-        uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
-        short4 tmp      = ARITHM_OP(convert_short4_sat(src1_data), convert_short4_sat(src2_data));
-        uchar4 tmp_data = convert_uchar4_sat(tmp);
-
-        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
-        dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
-        dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
-        dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;
-
-        *((__global uchar4 *)(dst + dst_index)) = dst_data;
+        dst[dst_index] = convertToT(convertToWT(src1[src1_index]) Operation convertToWT(src2[src2_index])); // both operands go through convertToWT, the result through convertToT; one element written per work-item
     }
 }
-__kernel void arithm_add_D2 (__global ushort *src1, int src1_step, int src1_offset,
-                             __global ushort *src2, int src2_step, int src2_offset,
-                             __global ushort *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1)
 
+__kernel void arithm_binary_op_mat_div(__global T *src1, int src1_step, int src1_offset, // element-wise: dst = src1 / src2, with 0 stored where src2 is 0
+                         __global T *src2, int src2_step, int src2_offset,
+                         __global T *dst, int dst_step, int dst_offset, // indices built from *_step / *_offset are applied directly to the T pointers (element units, no char* cast)
+                         int cols, int rows) // T, WT, convertToT, convertToWT are not defined in this view - presumably build-time macros; TODO confirm
 {
     int x = get_global_id(0);
     int y = get_global_id(1);
 
     if (x < cols && y < rows)
     {
-        x = x << 2;
+        int src1_index = mad24(y, src1_step, x + src1_offset); // y * src1_step + x + src1_offset
+        int src2_index = mad24(y, src2_step, x + src2_offset); // y * src2_step + x + src2_offset
+        int dst_index  = mad24(y, dst_step, x + dst_offset); // y * dst_step + x + dst_offset
 
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align ((dst_offset / 2) & 3)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
-        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8);
-
-        ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index));
-        ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));
-
-        ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index));
-        int4    tmp = ARITHM_OP(convert_int4_sat(src1_data), convert_int4_sat(src2_data));
-        ushort4 tmp_data = convert_ushort4_sat(tmp);
-
-        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
-        dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y;
-        dst_data.z = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.z : dst_data.z;
-        dst_data.w = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) ? tmp_data.w : dst_data.w;
-
-        *((__global ushort4 *)((__global char *)dst + dst_index)) = dst_data;
+        T zero = (T)(0); // value stored when the divisor is zero
+        dst[dst_index] = src2[src2_index] == zero ? zero : convertToT(convertToWT(src1[src1_index]) / convertToWT(src2[src2_index])); // zero divisor short-circuits to 0; otherwise divide in WT and convert back to T
     }
 }
-__kernel void arithm_add_D3 (__global short *src1, int src1_step, int src1_offset,
-                             __global short *src2, int src2_step, int src2_offset,
-                             __global short *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1)
+
+__kernel void arithm_absdiff_mat(__global T *src1, int src1_step, int src1_offset, // element-wise: dst = |src1 - src2|
+                         __global T *src2, int src2_step, int src2_offset,
+                         __global T *dst, int dst_step, int dst_offset, // indices built from *_step / *_offset are applied directly to the T pointers (element units, no char* cast)
+                         int cols, int rows) // T, WT, convertToT, convertToWT are not defined in this view - presumably build-time macros; TODO confirm
 {
     int x = get_global_id(0);
     int y = get_global_id(1);
 
     if (x < cols && y < rows)
     {
-        x = x << 2;
+        int src1_index = mad24(y, src1_step, x + src1_offset); // y * src1_step + x + src1_offset
+        int src2_index = mad24(y, src2_step, x + src2_offset); // y * src2_step + x + src2_offset
+        int dst_index  = mad24(y, dst_step, x + dst_offset); // y * dst_step + x + dst_offset
 
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align ((dst_offset / 2) & 3)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
-        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8);
-
-        short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index));
-        short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));
-
-        short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
-        int4   tmp = ARITHM_OP(convert_int4_sat(src1_data), convert_int4_sat(src2_data));
-        short4 tmp_data = convert_short4_sat(tmp);
-
-        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
-        dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y;
-        dst_data.z = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.z : dst_data.z;
-        dst_data.w = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) ? tmp_data.w : dst_data.w;
-
-        *((__global short4 *)((__global char *)dst + dst_index)) = dst_data;
+        WT value = convertToWT(src1[src1_index]) - convertToWT(src2[src2_index]); // subtract after converting both operands to WT
+        value = value > (WT)(0) ? value : -value; // absolute value: negate unless strictly positive
+        dst[dst_index] = convertToT(value); // convert the WT magnitude back to T for storage
     }
 }
 
-__kernel void arithm_add_D4 (__global int *src1, int src1_step, int src1_offset,
-                             __global int *src2, int src2_step, int src2_offset,
-                             __global int *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1)
+// element-wise multiply of two mats with an extra scale factor: dst = src1 * scalar[0] * src2
+__kernel void arithm_binary_op_mat_scalar(__global T *src1, int src1_step, int src1_offset, // T, WT, convertToT, convertToWT are not defined in this view - presumably build-time macros; TODO confirm
+                                __global T *src2, int src2_step, int src2_offset,
+                               __global WT *scalar, // global buffer; only scalar[0] is read
+                               __global T *dst, int dst_step,  int dst_offset, // indices built from *_step / *_offset are applied directly to the T pointers (element units, no char* cast)
+                               int cols, int rows)
 {
     int x = get_global_id(0);
     int y = get_global_id(1);
 
     if (x < cols && y < rows)
     {
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
-        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);
+        int src1_index = mad24(y, src1_step, x + src1_offset); // y * src1_step + x + src1_offset
+        int src2_index = mad24(y, src2_step, x + src2_offset); // y * src2_step + x + src2_offset
+        int dst_index = mad24(y, dst_step, x + dst_offset); // y * dst_step + x + dst_offset
 
-        int data1 = *((__global int *)((__global char *)src1 + src1_index));
-        int data2 = *((__global int *)((__global char *)src2 + src2_index));
-        long tmp  = ARITHM_OP((long)(data1), (long)(data2));
-
-        *((__global int *)((__global char *)dst + dst_index)) = convert_int_sat(tmp);
+        dst[dst_index] = convertToT(convertToWT(src1[src1_index]) * scalar[0] * convertToWT(src2[src2_index])); // product evaluated left to right in WT, then converted back to T
     }
 }
-__kernel void arithm_add_D5 (__global float *src1, int src1_step, int src1_offset,
-                             __global float *src2, int src2_step, int src2_offset,
-                             __global float *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1)
+
+// element-wise scaled divide of two mats: dst = src1 * scalar[0] / src2, with 0 stored where src2 is 0
+__kernel void arithm_binary_op_mat_scalar_div(__global T *src1, int src1_step, int src1_offset, // T, WT, convertToT, convertToWT are not defined in this view - presumably build-time macros; TODO confirm
+                                __global T *src2, int src2_step, int src2_offset,
+                               __global WT *scalar, // global buffer; only scalar[0] is read
+                               __global T *dst, int dst_step,  int dst_offset, // indices built from *_step / *_offset are applied directly to the T pointers (element units, no char* cast)
+                               int cols, int rows)
 {
     int x = get_global_id(0);
     int y = get_global_id(1);
 
     if (x < cols && y < rows)
     {
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
-        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);
+        int src1_index = mad24(y, src1_step, x + src1_offset); // y * src1_step + x + src1_offset
+        int src2_index = mad24(y, src2_step, x + src2_offset); // y * src2_step + x + src2_offset
+        int dst_index = mad24(y, dst_step, x + dst_offset); // y * dst_step + x + dst_offset
 
-        float data1 = *((__global float *)((__global char *)src1 + src1_index));
-        float data2 = *((__global float *)((__global char *)src2 + src2_index));
-        float tmp = ARITHM_OP(data1, data2);
-
-        *((__global float *)((__global char *)dst + dst_index)) = tmp;
+        T zero = (T)(0); // value stored when the divisor is zero
+        dst[dst_index] = src2[src2_index] == zero ? zero : // zero divisor short-circuits to 0 instead of dividing
+            convertToT(convertToWT(src1[src1_index]) * scalar[0] / convertToWT(src2[src2_index])); // scale is applied to src1 before the division; result converted back to T
     }
 }
-
-#if defined (DOUBLE_SUPPORT)
-__kernel void arithm_add_D6 (__global double *src1, int src1_step, int src1_offset,
-                             __global double *src2, int src2_step, int src2_offset,
-                             __global double *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
-        int src2_index = mad24(y, src2_step, (x << 3) + src2_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 3) + dst_offset);
-
-        double data1 = *((__global double *)((__global char *)src1 + src1_index));
-        double data2 = *((__global double *)((__global char *)src2 + src2_index));
-
-        *((__global double *)((__global char *)dst + dst_index)) = ARITHM_OP(data1, data2);
-    }
-}
-#endif
-
-/**************************************add with mask**************************************/
-__kernel void arithm_add_with_mask_C1_D0 (__global uchar *src1, int src1_step, int src1_offset,
-                                          __global uchar *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global uchar *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 2;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align (dst_offset & 3)
-        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
-        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
-        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
-        int mask_index_fix = mask_index < 0 ? 0 : mask_index;
-        uchar4 src1_data = vload4(0, src1 + src1_index_fix);
-        uchar4 src2_data = vload4(0, src2 + src2_index_fix);
-        uchar4 mask_data = vload4(0, mask + mask_index_fix);
-        if(src1_index < 0)
-        {
-            uchar4 tmp;
-            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-        }
-        if(src2_index < 0)
-        {
-            uchar4 tmp;
-            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-        }
-        if(mask_index < 0)
-        {
-            uchar4 tmp;
-            tmp.xyzw = (mask_index == -2) ? mask_data.zwxy:mask_data.yzwx;
-            mask_data.xyzw = (mask_index == -1) ? mask_data.wxyz:tmp.xyzw;
-        }
-
-        uchar4 data = *((__global uchar4 *)(dst + dst_index));
-        short4 tmp = ARITHM_OP(convert_short4_sat(src1_data), convert_short4_sat(src2_data));
-        uchar4 tmp_data = convert_uchar4_sat(tmp);
-
-        data.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x;
-        data.y = ((mask_data.y) && (dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : data.y;
-        data.z = ((mask_data.z) && (dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : data.z;
-        data.w = ((mask_data.w) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : data.w;
-
-        *((__global uchar4 *)(dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_add_with_mask_C1_D2 (__global ushort *src1, int src1_step, int src1_offset,
-                                          __global ushort *src2, int src2_step, int src2_offset,
-                                          __global uchar  *mask, int mask_step, int mask_offset,
-                                          __global ushort *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 1;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align ((dst_offset / 2) & 1)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
-        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
-        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
-
-        ushort2 src1_data = vload2(0, (__global ushort *)((__global char *)src1 + src1_index));
-        ushort2 src2_data = vload2(0, (__global ushort *)((__global char *)src2 + src2_index));
-        uchar2  mask_data = vload2(0, mask + mask_index);
-
-        ushort2 data = *((__global ushort2 *)((__global uchar *)dst + dst_index));
-        int2    tmp = ARITHM_OP(convert_int2_sat(src1_data), convert_int2_sat(src2_data));
-        ushort2 tmp_data = convert_ushort2_sat(tmp);
-
-        data.x = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.x : data.x;
-        data.y = ((mask_data.y) && (dst_index + 2 <  dst_end  )) ? tmp_data.y : data.y;
-
-        *((__global ushort2 *)((__global uchar *)dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_add_with_mask_C1_D3 (__global short *src1, int src1_step, int src1_offset,
-                                          __global short *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global short *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 1;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align ((dst_offset / 2) & 1)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
-        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
-        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
-
-        short2 src1_data = vload2(0, (__global short *)((__global char *)src1 + src1_index));
-        short2 src2_data = vload2(0, (__global short *)((__global char *)src2 + src2_index));
-        uchar2  mask_data = vload2(0, mask + mask_index);
-
-        short2 data = *((__global short2 *)((__global uchar *)dst + dst_index));
-        int2    tmp = ARITHM_OP(convert_int2_sat(src1_data), convert_int2_sat(src2_data));
-        short2 tmp_data = convert_short2_sat(tmp);
-
-        data.x = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.x : data.x;
-        data.y = ((mask_data.y) && (dst_index + 2 <  dst_end  )) ? tmp_data.y : data.y;
-
-        *((__global short2 *)((__global uchar *)dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_add_with_mask_C1_D4 (__global int   *src1, int src1_step, int src1_offset,
-                                          __global int   *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global int   *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
-        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset);
-        int mask_index = mad24(y, mask_step,  x       + mask_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);
-
-        uchar mask_data = *(mask + mask_index);
-
-        int src_data1 = *((__global int *)((__global char *)src1 + src1_index));
-        int src_data2 = *((__global int *)((__global char *)src2 + src2_index));
-        int dst_data  = *((__global int *)((__global char *)dst  + dst_index));
-
-        int data = convert_int_sat(ARITHM_OP((long)src_data1, (long)src_data2));
-        data = mask_data ? data : dst_data;
-
-        *((__global int *)((__global char *)dst + dst_index)) = data;
-    }
-}
-
-__kernel void arithm_add_with_mask_C1_D5 (__global float *src1, int src1_step, int src1_offset,
-                                          __global float *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global float *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
-        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset);
-        int mask_index = mad24(y, mask_step,  x       + mask_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);
-
-        uchar mask_data = *(mask + mask_index);
-
-        float src_data1 = *((__global float *)((__global char *)src1 + src1_index));
-        float src_data2 = *((__global float *)((__global char *)src2 + src2_index));
-        float dst_data  = *((__global float *)((__global char *)dst  + dst_index));
-
-        float data = ARITHM_OP(src_data1, src_data2);
-        data = mask_data ? data : dst_data;
-
-        *((__global float *)((__global char *)dst + dst_index)) = data;
-    }
-}
-
-#if defined (DOUBLE_SUPPORT)
-__kernel void arithm_add_with_mask_C1_D6 (__global double *src1, int src1_step, int src1_offset,
-                                          __global double *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global double *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
-        int src2_index = mad24(y, src2_step, (x << 3) + src2_offset);
-        int mask_index = mad24(y, mask_step,  x       + mask_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 3) + dst_offset);
-
-        uchar mask_data = *(mask + mask_index);
-
-        double src_data1 = *((__global double *)((__global char *)src1 + src1_index));
-        double src_data2 = *((__global double *)((__global char *)src2 + src2_index));
-        double dst_data  = *((__global double *)((__global char *)dst  + dst_index));
-
-        double data = ARITHM_OP(src_data1, src_data2);
-        data = mask_data ? data : dst_data;
-
-        *((__global double *)((__global char *)dst + dst_index)) = data;
-    }
-}
-#endif
-
-__kernel void arithm_add_with_mask_C2_D0 (__global uchar *src1, int src1_step, int src1_offset,
-                                          __global uchar *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global uchar *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 1;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align ((dst_offset / 2) & 1)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
-        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
-        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
-
-        uchar4 src1_data = vload4(0, src1 + src1_index);
-        uchar4 src2_data = vload4(0, src2 + src2_index);
-        uchar2 mask_data = vload2(0, mask + mask_index);
-
-        uchar4 data = *((__global uchar4 *)(dst + dst_index));
-        short4   tmp = ARITHM_OP(convert_short4_sat(src1_data), convert_short4_sat(src2_data));
-        uchar4 tmp_data = convert_uchar4_sat(tmp);
-
-        data.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.xy : data.xy;
-        data.zw = ((mask_data.y) && (dst_index + 2 <  dst_end  )) ? tmp_data.zw : data.zw;
-
-        *((__global uchar4 *)(dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_add_with_mask_C2_D2 (__global ushort *src1, int src1_step, int src1_offset,
-                                          __global ushort *src2, int src2_step, int src2_offset,
-                                          __global uchar  *mask, int mask_step, int mask_offset,
-                                          __global ushort *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
-        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset);
-        int mask_index = mad24(y, mask_step,  x       + mask_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);
-
-        uchar mask_data = *(mask + mask_index);
-
-        ushort2 src_data1 = *((__global ushort2 *)((__global char *)src1 + src1_index));
-        ushort2 src_data2 = *((__global ushort2 *)((__global char *)src2 + src2_index));
-        ushort2 dst_data  = *((__global ushort2 *)((__global char *)dst  + dst_index));
-
-        int2    tmp = ARITHM_OP(convert_int2_sat(src_data1), convert_int2_sat(src_data2));
-        ushort2 data = convert_ushort2_sat(tmp);
-        data = mask_data ? data : dst_data;
-
-        *((__global ushort2 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_add_with_mask_C2_D3 (__global short *src1, int src1_step, int src1_offset,
-                                          __global short *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global short *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
-        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset);
-        int mask_index = mad24(y, mask_step,  x       + mask_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);
-
-        uchar mask_data = *(mask + mask_index);
-
-        short2 src_data1 = *((__global short2 *)((__global char *)src1 + src1_index));
-        short2 src_data2 = *((__global short2 *)((__global char *)src2 + src2_index));
-        short2 dst_data  = *((__global short2 *)((__global char *)dst  + dst_index));
-
-        int2    tmp = ARITHM_OP(convert_int2_sat(src_data1), convert_int2_sat(src_data2));
-        short2 data = convert_short2_sat(tmp);
-        data = mask_data ? data : dst_data;
-
-        *((__global short2 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_add_with_mask_C2_D4 (__global int   *src1, int src1_step, int src1_offset,
-                                          __global int   *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global int    *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
-        int src2_index = mad24(y, src2_step, (x << 3) + src2_offset);
-        int mask_index = mad24(y, mask_step,  x       + mask_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 3) + dst_offset);
-
-        uchar mask_data = *(mask + mask_index);
-
-        int2 src_data1 = *((__global int2 *)((__global char *)src1 + src1_index));
-        int2 src_data2 = *((__global int2 *)((__global char *)src2 + src2_index));
-        int2 dst_data  = *((__global int2 *)((__global char *)dst  + dst_index));
-
-        int2 data = convert_int2_sat(ARITHM_OP(convert_long2_sat(src_data1), convert_long2_sat(src_data2)));
-        data = mask_data ? data : dst_data;
-
-        *((__global int2 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_add_with_mask_C2_D5 (__global float *src1, int src1_step, int src1_offset,
-                                          __global float *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global float *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
-        int src2_index = mad24(y, src2_step, (x << 3) + src2_offset);
-        int mask_index = mad24(y, mask_step,  x       + mask_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 3) + dst_offset);
-
-        uchar mask_data = *(mask + mask_index);
-
-        float2 src_data1 = *((__global float2 *)((__global char *)src1 + src1_index));
-        float2 src_data2 = *((__global float2 *)((__global char *)src2 + src2_index));
-        float2 dst_data  = *((__global float2 *)((__global char *)dst  + dst_index));
-
-        float2 data = ARITHM_OP(src_data1, src_data2);
-        data = mask_data ? data : dst_data;
-
-        *((__global float2 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-
-#if defined (DOUBLE_SUPPORT)
-__kernel void arithm_add_with_mask_C2_D6 (__global double *src1, int src1_step, int src1_offset,
-                                          __global double *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global double *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
-        int src2_index = mad24(y, src2_step, (x << 4) + src2_offset);
-        int mask_index = mad24(y, mask_step,  x       + mask_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 4) + dst_offset);
-
-        uchar mask_data = *(mask + mask_index);
-
-        double2 src_data1 = *((__global double2 *)((__global char *)src1 + src1_index));
-        double2 src_data2 = *((__global double2 *)((__global char *)src2 + src2_index));
-        double2 dst_data  = *((__global double2 *)((__global char *)dst  + dst_index));
-
-        double2 data = ARITHM_OP(src_data1, src_data2);
-        data = mask_data ? data : dst_data;
-
-        *((__global double2 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-#endif
-
-__kernel void arithm_add_with_mask_C4_D0 (__global uchar *src1, int src1_step, int src1_offset,
-                                          __global uchar *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global uchar *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
-        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset);
-        int mask_index = mad24(y, mask_step,  x       + mask_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);
-
-        uchar mask_data = *(mask + mask_index);
-
-        uchar4 src_data1 = *((__global uchar4 *)(src1 + src1_index));
-        uchar4 src_data2 = *((__global uchar4 *)(src2 + src2_index));
-        uchar4 dst_data  = *((__global uchar4 *)(dst  + dst_index));
-
-        uchar4 data = convert_uchar4_sat(ARITHM_OP(convert_short4_sat(src_data1), convert_short4_sat(src_data2)));
-        data = mask_data ? data : dst_data;
-
-        *((__global uchar4 *)(dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_add_with_mask_C4_D2 (__global ushort *src1, int src1_step, int src1_offset,
-                                          __global ushort *src2, int src2_step, int src2_offset,
-                                          __global uchar  *mask, int mask_step, int mask_offset,
-                                          __global ushort *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
-        int src2_index = mad24(y, src2_step, (x << 3) + src2_offset);
-        int mask_index = mad24(y, mask_step,  x       + mask_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 3) + dst_offset);
-
-        uchar mask_data = *(mask + mask_index);
-
-        ushort4 src_data1 = *((__global ushort4 *)((__global char *)src1 + src1_index));
-        ushort4 src_data2 = *((__global ushort4 *)((__global char *)src2 + src2_index));
-        ushort4 dst_data  = *((__global ushort4 *)((__global char *)dst  + dst_index));
-
-        ushort4 data = convert_ushort4_sat(ARITHM_OP(convert_int4_sat(src_data1), convert_int4_sat(src_data2)));
-        data = mask_data ? data : dst_data;
-
-        *((__global ushort4 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_add_with_mask_C4_D3 (__global short *src1, int src1_step, int src1_offset,
-                                          __global short *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global short *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
-        int src2_index = mad24(y, src2_step, (x << 3) + src2_offset);
-        int mask_index = mad24(y, mask_step,  x       + mask_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 3) + dst_offset);
-
-        uchar mask_data = *(mask + mask_index);
-
-        short4 src_data1 = *((__global short4 *)((__global char *)src1 + src1_index));
-        short4 src_data2 = *((__global short4 *)((__global char *)src2 + src2_index));
-        short4 dst_data  = *((__global short4 *)((__global char *)dst  + dst_index));
-
-        short4 data = convert_short4_sat(ARITHM_OP(convert_int4_sat(src_data1), convert_int4_sat(src_data2)));
-        data = mask_data ? data : dst_data;
-
-        *((__global short4 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_add_with_mask_C4_D4 (__global int   *src1, int src1_step, int src1_offset,
-                                          __global int   *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global int   *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
-        int src2_index = mad24(y, src2_step, (x << 4) + src2_offset);
-        int mask_index = mad24(y, mask_step,  x       + mask_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 4) + dst_offset);
-
-        uchar mask_data = *(mask + mask_index);
-
-        int4 src_data1 = *((__global int4 *)((__global char *)src1 + src1_index));
-        int4 src_data2 = *((__global int4 *)((__global char *)src2 + src2_index));
-        int4 dst_data  = *((__global int4 *)((__global char *)dst  + dst_index));
-
-        int4 data = convert_int4_sat(ARITHM_OP(convert_long4_sat(src_data1), convert_long4_sat(src_data2)));
-        data = mask_data ? data : dst_data;
-
-        *((__global int4 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_add_with_mask_C4_D5 (__global float *src1, int src1_step, int src1_offset,
-                                          __global float *src2, int src2_step, int src2_offset,
-                                          __global uchar *mask, int mask_step, int mask_offset,
-                                          __global float *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
-        int src2_index = mad24(y, src2_step, (x << 4) + src2_offset);
-        int mask_index = mad24(y, mask_step,  x       + mask_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 4) + dst_offset);
-
-        uchar mask_data = *(mask + mask_index);
-
-        float4 src_data1 = *((__global float4 *)((__global char *)src1 + src1_index));
-        float4 src_data2 = *((__global float4 *)((__global char *)src2 + src2_index));
-        float4 dst_data  = *((__global float4 *)((__global char *)dst  + dst_index));
-
-        float4 data = ARITHM_OP(src_data1, src_data2);
-        data = mask_data ? data : dst_data;
-
-        *((__global float4 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-
-#if defined (DOUBLE_SUPPORT)
-__kernel void arithm_add_with_mask_C4_D6 (__global double *src1, int src1_step, int src1_offset,
-                                          __global double *src2, int src2_step, int src2_offset,
-                                          __global uchar  *mask, int mask_step, int mask_offset,
-                                          __global double *dst,  int dst_step,  int dst_offset,
-                                          int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 5) + src1_offset);
-        int src2_index = mad24(y, src2_step, (x << 5) + src2_offset);
-        int mask_index = mad24(y, mask_step,  x       + mask_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 5) + dst_offset);
-
-        uchar mask_data = *(mask + mask_index);
-
-        double4 src_data1 = *((__global double4 *)((__global char *)src1 + src1_index));
-        double4 src_data2 = *((__global double4 *)((__global char *)src2 + src2_index));
-        double4 dst_data  = *((__global double4 *)((__global char *)dst  + dst_index));
-
-        double4 data = ARITHM_OP(src_data1, src_data2);
-        data = mask_data ? data : dst_data;
-
-        *((__global double4 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-#endif
diff --git a/modules/ocl/src/opencl/arithm_addWeighted.cl b/modules/ocl/src/opencl/arithm_addWeighted.cl
index e7ed28928..159a970db 100644
--- a/modules/ocl/src/opencl/arithm_addWeighted.cl
+++ b/modules/ocl/src/opencl/arithm_addWeighted.cl
@@ -42,392 +42,34 @@
 // the use of this software, even if advised of the possibility of such damage.
 //
 //M*/
+
 #if defined (DOUBLE_SUPPORT)
 #ifdef cl_khr_fp64
 #pragma OPENCL EXTENSION cl_khr_fp64:enable
 #elif defined (cl_amd_fp64)
 #pragma OPENCL EXTENSION cl_amd_fp64:enable
 #endif
-typedef double F;
-#else
-typedef float F;
 #endif
+
 //////////////////////////////////////////////////////////////////////////////////////////////////////
 /////////////////////////////////////////////addWeighted//////////////////////////////////////////////
 ///////////////////////////////////////////////////////////////////////////////////////////////////////
-__kernel void addWeighted_D0 (__global uchar *src1,int src1_step,int src1_offset,
-                              __global uchar *src2, int src2_step,int src2_offset,
-                              F alpha,F beta,F gama,
-                              __global uchar *dst,  int dst_step,int dst_offset,
-                              int rows,  int cols,int dst_step1)
+// Per-element weighted sum: dst = convertToT(src1 * alpha + src2 * beta + gama).
+__kernel void addWeighted(__global T * src1, int src1_step1, int src1_offset1, // T, WT, convertToT are not defined in this file; presumably build-time macros - TODO confirm
+                              __global T * src2, int src2_step1, int src2_offset1,
+                              __global T * dst, int dst_step1, int dst_offset1, // steps/offsets are used directly as T-element subscripts
+                              WT alpha, WT beta, WT gama, // multipliers for src1 and src2, plus an additive term
+                              int cols1, int rows) // upper bounds for the two global work-item ids
 {
     int x = get_global_id(0);
     int y = get_global_id(1);
 
-    if (x < cols && y < rows)
-
+    if (x < cols1 && y < rows) // work-items outside the cols1 x rows range do nothing
     {
+        int src1_index = mad24(y, src1_step1, x + src1_offset1); // y * step + x + offset, subscripting src1
+        int src2_index = mad24(y, src2_step1, x + src2_offset1); // same layout formula for src2
+        int dst_index = mad24(y, dst_step1, x + dst_offset1); // same layout formula for dst
 
-        x = x << 2;
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align (dst_offset & 3)
-        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
-        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-
-        uchar4 src1_data ,src2_data;
-
-        src1_data.x= src1_index+0 >= 0 ? src1[src1_index+0] : 0;
-        src1_data.y= src1_index+1 >= 0 ? src1[src1_index+1] : 0;
-        src1_data.z= src1_index+2 >= 0 ? src1[src1_index+2] : 0;
-        src1_data.w= src1_index+3 >= 0 ? src1[src1_index+3] : 0;
-
-        src2_data.x= src2_index+0 >= 0 ? src2[src2_index+0] : 0;
-        src2_data.y= src2_index+1 >= 0 ? src2[src2_index+1] : 0;
-        src2_data.z= src2_index+2 >= 0 ? src2[src2_index+2] : 0;
-        src2_data.w= src2_index+3 >= 0 ? src2[src2_index+3] : 0;
-
-        uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
-//        short4 tmp      = convert_short4_sat(src1_data) * alpha + convert_short4_sat(src2_data) * beta + gama;
-        short4 tmp;
-        tmp.x = src1_data.x * alpha + src2_data.x * beta + gama;
-        tmp.y = src1_data.y * alpha + src2_data.y * beta + gama;
-        tmp.z = src1_data.z * alpha + src2_data.z * beta + gama;
-        tmp.w = src1_data.w * alpha + src2_data.w * beta + gama;
-        uchar4 tmp_data = convert_uchar4_sat(tmp);
-
-        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
-        dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
-        dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
-        dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;
-
-        *((__global uchar4 *)(dst + dst_index)) = dst_data;
-        // dst[x + y * dst_step] = src1[x + y * src1_step] * alpha + src2[x + y * src2_step] * beta + gama;
+        dst[dst_index] = convertToT(src1[src1_index]*alpha + src2[src2_index]*beta + gama); // one element per work-item; result passed through convertToT before the store
     }
-
 }
-
-
-
-__kernel void addWeighted_D2 (__global ushort *src1, int src1_step,int src1_offset,
-                              __global ushort *src2, int src2_step,int src2_offset,
-                              F alpha,F beta,F gama,
-                              __global ushort *dst,  int dst_step,int dst_offset,
-                              int rows,  int cols,int dst_step1)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-
-    {
-
-        x = x << 2;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align ((dst_offset >> 1) & 3)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset -( dst_align << 1));
-        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset -( dst_align << 1));
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset +( x<< 1) & (int)0xfffffff8);
-        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
-        ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index_fix));
-        ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index_fix));
-        if(src1_index < 0)
-        {
-            ushort4 tmp;
-            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-        }
-        if(src2_index < 0)
-        {
-            ushort4 tmp;
-            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-        }
-
-
-        ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index));
-        // int4 tmp      = convert_int4_sat(src1_data) * alpha + convert_int4_sat(src2_data) * beta + gama;
-        int4 tmp;
-        tmp.x = src1_data.x * alpha + src2_data.x * beta + gama;
-        tmp.y = src1_data.y * alpha + src2_data.y * beta + gama;
-        tmp.z = src1_data.z * alpha + src2_data.z * beta + gama;
-        tmp.w = src1_data.w * alpha + src2_data.w * beta + gama;
-        ushort4 tmp_data = convert_ushort4_sat(tmp);
-        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
-        dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y;
-        dst_data.z = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.z : dst_data.z;
-        dst_data.w = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) ? tmp_data.w : dst_data.w;
-
-        *((__global ushort4 *)((__global char *)dst + dst_index)) = dst_data;
-    }
-
-
-}
-
-
-__kernel void addWeighted_D3 (__global short *src1, int src1_step,int src1_offset,
-                              __global short *src2,  int src2_step,int src2_offset,
-                              F alpha,F beta,F gama,
-                              __global short *dst,  int dst_step,int dst_offset,
-                              int rows,  int cols,int dst_step1)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-
-    {
-
-        x = x << 2;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align ((dst_offset >> 1) & 3)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset -( dst_align << 1));
-        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset -( dst_align << 1));
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset +( x<< 1) - (dst_align << 1 ));
-
-        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
-        short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index_fix));
-        short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index_fix));
-
-        if(src1_index < 0)
-        {
-            short4 tmp;
-            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-        }
-        if(src2_index < 0)
-        {
-            short4 tmp;
-            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-        }
-        short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
-        // int4 tmp      = convert_int4_sat(src1_data) * alpha + convert_int4_sat(src2_data) * beta + gama;
-        int4 tmp;
-        tmp.x = src1_data.x * alpha + src2_data.x * beta + gama;
-        tmp.y = src1_data.y * alpha + src2_data.y * beta + gama;
-        tmp.z = src1_data.z * alpha + src2_data.z * beta + gama;
-        tmp.w = src1_data.w * alpha + src2_data.w * beta + gama;
-        short4 tmp_data = convert_short4_sat(tmp);
-        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
-        dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y;
-        dst_data.z = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.z : dst_data.z;
-        dst_data.w = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) ? tmp_data.w : dst_data.w;
-
-        *((__global short4 *)((__global char *)dst + dst_index)) = dst_data;
-    }
-
-}
-
-
-__kernel void addWeighted_D4 (__global int *src1, int src1_step,int src1_offset,
-                              __global int *src2, int src2_step,int src2_offset,
-                              F alpha,F beta, F gama,
-                              __global int *dst,  int dst_step,int dst_offset,
-                              int rows,  int cols,int dst_step1)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-
-    {
-
-        x = x << 2;
-
-#define bitOfInt  (sizeof(int)== 4 ? 2: 3)
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align ((dst_offset >> bitOfInt) & 3)
-
-        int src1_index = mad24(y, src1_step, (x << bitOfInt) + src1_offset - (dst_align << bitOfInt));
-        int src2_index = mad24(y, src2_step, (x << bitOfInt) + src2_offset - (dst_align << bitOfInt));
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + (x << bitOfInt) -(dst_align << bitOfInt));
-
-        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
-        int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index_fix));
-        int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index_fix));
-
-        if(src1_index < 0)
-        {
-            int4 tmp;
-            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-        }
-        if(src2_index < 0)
-        {
-            int4 tmp;
-            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-        }
-        int4 dst_data = *((__global int4 *)((__global char *)dst + dst_index));
-        // double4   tmp = convert_double4(src1_data) * alpha + convert_double4(src2_data) * beta + gama ;
-        float4 tmp;
-        tmp.x = src1_data.x * alpha + src2_data.x * beta + gama;
-        tmp.y = src1_data.y * alpha + src2_data.y * beta + gama;
-        tmp.z = src1_data.z * alpha + src2_data.z * beta + gama;
-        tmp.w = src1_data.w * alpha + src2_data.w * beta + gama;
-        int4 tmp_data = convert_int4_sat(tmp);
-
-        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
-        dst_data.y = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.y : dst_data.y;
-        dst_data.z = ((dst_index + 8 >= dst_start) && (dst_index + 8 < dst_end)) ? tmp_data.z : dst_data.z;
-        dst_data.w = ((dst_index + 12 >= dst_start) && (dst_index + 12 < dst_end)) ? tmp_data.w : dst_data.w;
-
-        *((__global int4 *)((__global char *)dst + dst_index)) = dst_data;
-    }
-
-}
-
-
-__kernel void addWeighted_D5 (__global float *src1,int src1_step,int src1_offset,
-                              __global float *src2, int src2_step,int src2_offset,
-                              F alpha,F beta, F gama,
-                              __global float *dst,  int dst_step,int dst_offset,
-                              int rows,  int cols,int dst_step1)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-
-    {
-
-        x = x << 2;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align ((dst_offset >> 2) & 3)
-
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
-        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + (x << 2) -(dst_align << 2));
-
-        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
-        float4 src1_data = vload4(0, (__global float  *)((__global char *)src1 + src1_index_fix));
-        float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix));
-        float4 dst_data = *((__global float4 *)((__global char *)dst + dst_index));
-        if(src1_index < 0)
-        {
-            float4 tmp;
-            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-        }
-        if(src2_index < 0)
-        {
-            float4 tmp;
-            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-        }
-        //    double4   tmp = convert_double4(src1_data) * alpha + convert_double4(src2_data) * beta + gama ;
-
-        // float4   tmp_data =(src1_data) * alpha + (src2_data) * beta + gama ;
-        float4 tmp_data;
-        tmp_data.x = src1_data.x * alpha + src2_data.x * beta + gama;
-        tmp_data.y = src1_data.y * alpha + src2_data.y * beta + gama;
-        tmp_data.z = src1_data.z * alpha + src2_data.z * beta + gama;
-        tmp_data.w = src1_data.w * alpha + src2_data.w * beta + gama;
-        // float4 tmp_data = convert_float4(tmp);
-
-        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
-        dst_data.y = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.y : dst_data.y;
-        dst_data.z = ((dst_index + 8 >= dst_start) && (dst_index + 8 < dst_end)) ? tmp_data.z : dst_data.z;
-        dst_data.w = ((dst_index + 12 >= dst_start) && (dst_index + 12 < dst_end)) ? tmp_data.w : dst_data.w;
-
-        *((__global float4 *)((__global char *)dst + dst_index)) = dst_data;
-    }
-
-}
-
-#if defined (DOUBLE_SUPPORT)
-__kernel void addWeighted_D6 (__global double *src1, int src1_step,int src1_offset,
-                              __global double *src2, int src2_step,int src2_offset,
-                              F alpha,F beta, F gama,
-                              __global double *dst,  int dst_step,int dst_offset,
-                              int rows,  int cols,int dst_step1)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-
-    {
-
-        x = x << 2;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align ((dst_offset >> 3) & 3)
-
-        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
-        int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + (x << 3) -(dst_align << 3));
-
-        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
-        double4 src1_data = vload4(0, (__global double  *)((__global char *)src1 + src1_index_fix));
-        double4 src2_data = vload4(0, (__global double  *)((__global char *)src2 + src2_index_fix));
-        double4 dst_data = *((__global double4 *)((__global char *)dst + dst_index));
-        if(src1_index < 0)
-        {
-            double4 tmp;
-            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-        }
-        if(src2_index < 0)
-        {
-            double4 tmp;
-            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-        }
-        //  double4   tmp_data = (src1_data) * alpha + (src2_data) * beta + gama ;
-        double4 tmp_data;
-        tmp_data.x = src1_data.x * alpha + src2_data.x * beta + gama;
-        tmp_data.y = src1_data.y * alpha + src2_data.y * beta + gama;
-        tmp_data.z = src1_data.z * alpha + src2_data.z * beta + gama;
-        tmp_data.w = src1_data.w * alpha + src2_data.w * beta + gama;
-
-        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
-        dst_data.y = ((dst_index + 8 >= dst_start) && (dst_index + 8 < dst_end)) ? tmp_data.y : dst_data.y;
-        dst_data.z = ((dst_index + 16 >= dst_start) && (dst_index + 16 < dst_end)) ? tmp_data.z : dst_data.z;
-        dst_data.w = ((dst_index + 24 >= dst_start) && (dst_index + 24 < dst_end)) ? tmp_data.w : dst_data.w;
-
-        *((__global double4 *)((__global char *)dst + dst_index)) = dst_data;
-    }
-
-}
-#endif
diff --git a/modules/ocl/src/opencl/arithm_add_mask.cl b/modules/ocl/src/opencl/arithm_add_mask.cl
new file mode 100644
index 000000000..52dbfc455
--- /dev/null
+++ b/modules/ocl/src/opencl/arithm_add_mask.cl
@@ -0,0 +1,79 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Jia Haipeng, jiahaipeng95@gmail.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#if defined (DOUBLE_SUPPORT)
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64:enable
+#elif defined (cl_amd_fp64)
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#endif
+#endif
+
+//////////////////////////////////////////////////////////////////////////////////
+///////////////////////////////// add with mask //////////////////////////////////
+//////////////////////////////////////////////////////////////////////////////////
+
+// Masked element-wise binary operation: dst = src1 <Operation> src2 wherever mask is non-zero.
+__kernel void arithm_binary_op_mat_mask(__global T * src1, int src1_step, int src1_offset,
+                                        __global T * src2, int src2_step, int src2_offset,
+                                        __global uchar * mask, int mask_step, int mask_offset,
+                                        __global T * dst, int dst_step, int dst_offset,
+                                        int cols, int rows)
+{
+    // One work-item handles one T element; ids beyond cols x rows do nothing.
+    const int col = get_global_id(0);
+    const int row = get_global_id(1);
+    if (col >= cols || row >= rows)
+        return;
+
+    // A zero mask byte leaves the matching dst element untouched.
+    if (!mask[mad24(row, mask_step, col + mask_offset)])
+        return;
+
+    const int lhs_at = mad24(row, src1_step, col + src1_offset);
+    const int rhs_at = mad24(row, src2_step, col + src2_offset);
+    const int out_at = mad24(row, dst_step, dst_offset + col);
+    dst[out_at] = convertToT(convertToWT(src1[lhs_at]) Operation convertToWT(src2[rhs_at]));
+}
diff --git a/modules/ocl/src/opencl/arithm_add_scalar.cl b/modules/ocl/src/opencl/arithm_add_scalar.cl
index cdb79f37e..4e0c7fc5f 100644
--- a/modules/ocl/src/opencl/arithm_add_scalar.cl
+++ b/modules/ocl/src/opencl/arithm_add_scalar.cl
@@ -51,463 +51,61 @@
 #endif
 #endif
 
-#ifdef ARITHM_ADD
-  #define ARITHM_OP(A,B) ((A)+(B))
-#elif defined ARITHM_SUB
-  #define ARITHM_OP(A,B) ((A)-(B))
-#endif
-/**************************************add with scalar without mask**************************************/
-__kernel void arithm_s_add_C1_D0 (__global   uchar *src1, int src1_step, int src1_offset,
-                                  __global   uchar *dst,  int dst_step,  int dst_offset,
-                                  int4 src2, int rows, int cols, int dst_step1)
+///////////////////////////////////////////////////////////////////////////////////
+////////////////////////////// Binary op with scalar //////////////////////////////
+///////////////////////////////////////////////////////////////////////////////////
+
+__kernel void arithm_binary_op_scalar (__global T *src1, int src1_step, int src1_offset,
+                                 __global WT *scalar, // only scalar[0] is read
+                                 __global T *dst,  int dst_step,  int dst_offset,
+                                 int cols, int rows) // element-wise op: dst = src1 Operation scalar[0], one element per work-item
 {
     int x = get_global_id(0);
     int y = get_global_id(1);
 
     if (x < cols && y < rows)
     {
-        x = x << 2;
+        int src1_index = mad24(y, src1_step, x + src1_offset); // y * src1_step + x + src1_offset
+        int dst_index = mad24(y, dst_step, x + dst_offset); // indexes the T pointer directly, so step/offset are presumably in elements of T, not bytes — TODO confirm
 
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align (dst_offset & 3)
-        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-        uchar4 src1_data = vload4(0, src1 + src1_index_fix);
-        int4 src2_data = (int4)(src2.x, src2.x, src2.x, src2.x);
-        if(src1_index < 0)
-        {
-            uchar4 tmp;
-            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-        }
-
-        uchar4 data = *((__global uchar4 *)(dst + dst_index));
-        int4 tmp = ARITHM_OP(convert_int4_sat(src1_data), src2_data);
-        uchar4 tmp_data = convert_uchar4_sat(tmp);
-
-        data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x;
-        data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : data.y;
-        data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : data.z;
-        data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : data.w;
-
-        *((__global uchar4 *)(dst + dst_index)) = data;
+        dst[dst_index] = convertToT(convertToWT(src1[src1_index]) Operation scalar[0]); // T/WT/convertTo*/Operation: presumably build-time macros — TODO confirm
     }
 }
-__kernel void arithm_s_add_C1_D2 (__global   ushort *src1, int src1_step, int src1_offset,
-                                  __global   ushort *dst,  int dst_step,  int dst_offset,
-                                  int4 src2, int rows, int cols, int dst_step1)
-{
 
+__kernel void arithm_absdiff_scalar(__global T *src1, int src1_step, int src1_offset,
+                         __global WT *src2, // only src2[0] is read
+                         __global T *dst, int dst_step, int dst_offset,
+                         int cols, int rows) // element-wise dst = |src1 - src2[0]|, one element per work-item
+{
     int x = get_global_id(0);
     int y = get_global_id(1);
 
     if (x < cols && y < rows)
     {
-        x = x << 1;
+        int src1_index = mad24(y, src1_step, x + src1_offset); // y * src1_step + x + src1_offset
+        int dst_index  = mad24(y, dst_step, x + dst_offset);   // indexes the T pointer directly; step/offset presumably in elements of T — TODO confirm
 
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align ((dst_offset >> 1) & 1)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
-
-        ushort2 src1_data = vload2(0, (__global ushort *)((__global char *)src1 + src1_index));
-        int2 src2_data = (int2)(src2.x, src2.x);
-
-        ushort2 data = *((__global ushort2 *)((__global uchar *)dst + dst_index));
-        int2    tmp = ARITHM_OP(convert_int2_sat(src1_data), src2_data);
-        ushort2 tmp_data = convert_ushort2_sat(tmp);
-
-        data.x = (dst_index + 0 >= dst_start) ? tmp_data.x : data.x;
-        data.y = (dst_index + 2 <  dst_end  ) ? tmp_data.y : data.y;
-
-        *((__global ushort2 *)((__global uchar *)dst + dst_index)) = data;
+        WT value = convertToWT(src1[src1_index]) - src2[0]; // the difference is formed in WT, not T
+        value = value > (WT)(0) ? value : -value; // absolute value; NOTE(review): only meaningful if WT is signed or floating-point — confirm
+        dst[dst_index] = convertToT(value); // convertToT/convertToWT: presumably build-time macros — TODO confirm
     }
 }
-__kernel void arithm_s_add_C1_D3 (__global   short *src1, int src1_step, int src1_offset,
-                                  __global   short *dst,  int dst_step,  int dst_offset,
-                                  int4 src2, int rows, int cols, int dst_step1)
-{
 
+// Divides a scalar by each matrix element: dst = scalar[0] / src1, with 0 written where src1 == 0.
+__kernel void arithm_binary_op_scalar_div(__global T *src1, int src1_step, int src1_offset,
+                               __global WT *scalar, // only scalar[0] is read
+                               __global T *dst,  int dst_step,  int dst_offset,
+                               int cols, int rows) // one element per work-item
+{
     int x = get_global_id(0);
     int y = get_global_id(1);
 
     if (x < cols && y < rows)
     {
-        x = x << 1;
+        int src1_index = mad24(y, src1_step, x + src1_offset); // y * src1_step + x + src1_offset
+        int dst_index = mad24(y, dst_step, x + dst_offset); // indexes the T pointer directly; step/offset presumably in elements of T — TODO confirm
 
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align ((dst_offset >> 1) & 1)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
-
-        short2 src1_data = vload2(0, (__global short *)((__global char *)src1 + src1_index));
-        int2 src2_data = (int2)(src2.x, src2.x);
-        short2 data = *((__global short2 *)((__global uchar *)dst + dst_index));
-
-        int2    tmp = ARITHM_OP(convert_int2_sat(src1_data), src2_data);
-        short2 tmp_data = convert_short2_sat(tmp);
-
-        data.x = (dst_index + 0 >= dst_start) ? tmp_data.x : data.x;
-        data.y = (dst_index + 2 <  dst_end  ) ? tmp_data.y : data.y;
-
-        *((__global short2 *)((__global uchar *)dst + dst_index)) = data;
+        T zero = (T)(0); // used both as the divisor sentinel and as the result for a zero divisor
+        dst[dst_index] = src1[src1_index] == zero ? zero : convertToT(scalar[0] / convertToWT(src1[src1_index])); // zero divisor yields zero instead of dividing
     }
 }
-__kernel void arithm_s_add_C1_D4 (__global   int *src1, int src1_step, int src1_offset,
-                                  __global   int *dst,  int dst_step,  int dst_offset,
-                                  int4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);
-
-        int src_data1 = *((__global int *)((__global char *)src1 + src1_index));
-        int src_data2 = src2.x;
-        int dst_data  = *((__global int *)((__global char *)dst  + dst_index));
-
-        int data = convert_int_sat(ARITHM_OP((long)src_data1, (long)src_data2));
-
-        *((__global int *)((__global char *)dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_s_add_C1_D5 (__global   float *src1, int src1_step, int src1_offset,
-                                  __global   float *dst,  int dst_step,  int dst_offset,
-                                  float4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);
-
-        float src_data1 = *((__global float *)((__global char *)src1 + src1_index));
-        float src_data2 = src2.x;
-        float dst_data  = *((__global float *)((__global char *)dst  + dst_index));
-
-        float data = ARITHM_OP(src_data1, src_data2);
-
-        *((__global float *)((__global char *)dst + dst_index)) = data;
-    }
-}
-
-#if defined (DOUBLE_SUPPORT)
-__kernel void arithm_s_add_C1_D6 (__global   double *src1, int src1_step, int src1_offset,
-                                  __global   double *dst,  int dst_step,  int dst_offset,
-                                  double4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 3) + dst_offset);
-
-        double src_data1 = *((__global double *)((__global char *)src1 + src1_index));
-        double src2_data = src2.x;
-        double dst_data  = *((__global double *)((__global char *)dst  + dst_index));
-
-        double data = ARITHM_OP(src_data1, src2_data);
-
-        *((__global double *)((__global char *)dst + dst_index)) = data;
-    }
-}
-#endif
-
-__kernel void arithm_s_add_C2_D0 (__global   uchar *src1, int src1_step, int src1_offset,
-                                  __global   uchar *dst,  int dst_step,  int dst_offset,
-                                  int4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 1;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align ((dst_offset >> 1) & 1)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
-
-        uchar4 src1_data = vload4(0, src1 + src1_index);
-        int4 src2_data = (int4)(src2.x, src2.y, src2.x, src2.y);
-
-        uchar4 data = *((__global uchar4 *)(dst + dst_index));
-        int4 tmp = ARITHM_OP(convert_int4_sat(src1_data), src2_data);
-        uchar4 tmp_data = convert_uchar4_sat(tmp);
-
-        data.xy = (dst_index + 0 >= dst_start) ? tmp_data.xy : data.xy;
-        data.zw = (dst_index + 2 <  dst_end  ) ? tmp_data.zw : data.zw;
-
-        *((__global uchar4 *)(dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_s_add_C2_D2 (__global   ushort *src1, int src1_step, int src1_offset,
-                                  __global   ushort *dst,  int dst_step,  int dst_offset,
-                                  int4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);
-
-        ushort2 src_data1 = *((__global ushort2 *)((__global char *)src1 + src1_index));
-        int2 src_data2 = (int2)(src2.x, src2.y);
-        ushort2 dst_data  = *((__global ushort2 *)((__global char *)dst  + dst_index));
-
-        int2    tmp = ARITHM_OP(convert_int2_sat(src_data1), src_data2);
-        ushort2 data = convert_ushort2_sat(tmp);
-
-        *((__global ushort2 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_s_add_C2_D3 (__global   short *src1, int src1_step, int src1_offset,
-                                  __global   short *dst,  int dst_step,  int dst_offset,
-                                  int4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);
-
-        short2 src_data1 = *((__global short2 *)((__global char *)src1 + src1_index));
-        int2 src_data2 = (int2)(src2.x, src2.y);
-        short2 dst_data  = *((__global short2 *)((__global char *)dst  + dst_index));
-
-        int2    tmp = ARITHM_OP(convert_int2_sat(src_data1), src_data2);
-        short2 data = convert_short2_sat(tmp);
-
-        *((__global short2 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_s_add_C2_D4 (__global   int *src1, int src1_step, int src1_offset,
-                                  __global   int *dst,  int dst_step,  int dst_offset,
-                                  int4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 3) + dst_offset);
-
-        int2 src_data1 = *((__global int2 *)((__global char *)src1 + src1_index));
-        int2 src_data2 = (int2)(src2.x, src2.y);
-        int2 dst_data  = *((__global int2 *)((__global char *)dst  + dst_index));
-
-        int2 data = convert_int2_sat(ARITHM_OP(convert_long2_sat(src_data1), convert_long2_sat(src_data2)));
-        *((__global int2 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_s_add_C2_D5 (__global   float *src1, int src1_step, int src1_offset,
-                                  __global   float *dst,  int dst_step,  int dst_offset,
-                                  float4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 3) + dst_offset);
-
-        float2 src_data1 = *((__global float2 *)((__global char *)src1 + src1_index));
-        float2 src_data2 = (float2)(src2.x, src2.y);
-        float2 dst_data  = *((__global float2 *)((__global char *)dst  + dst_index));
-
-        float2 data = ARITHM_OP(src_data1, src_data2);
-        *((__global float2 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-
-#if defined (DOUBLE_SUPPORT)
-__kernel void arithm_s_add_C2_D6 (__global   double *src1, int src1_step, int src1_offset,
-                                  __global   double *dst,  int dst_step,  int dst_offset,
-                                  double4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 4) + dst_offset);
-
-        double2 src_data1 = *((__global double2 *)((__global char *)src1 + src1_index));
-        double2 src_data2 = (double2)(src2.x, src2.y);
-        double2 dst_data  = *((__global double2 *)((__global char *)dst  + dst_index));
-
-        double2 data = ARITHM_OP(src_data1, src_data2);
-
-        *((__global double2 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-#endif
-
-__kernel void arithm_s_add_C4_D0 (__global   uchar *src1, int src1_step, int src1_offset,
-                                  __global   uchar *dst,  int dst_step,  int dst_offset,
-                                  int4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);
-
-        uchar4 src_data1 = *((__global uchar4 *)(src1 + src1_index));
-
-        uchar4 data = convert_uchar4_sat(ARITHM_OP(convert_int4_sat(src_data1), src2));
-
-        *((__global uchar4 *)(dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_s_add_C4_D2 (__global   ushort *src1, int src1_step, int src1_offset,
-                                  __global   ushort *dst,  int dst_step,  int dst_offset,
-                                  int4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 3) + dst_offset);
-
-        ushort4 src_data1 = *((__global ushort4 *)((__global char *)src1 + src1_index));
-
-        ushort4 data = convert_ushort4_sat(ARITHM_OP(convert_int4_sat(src_data1), src2));
-
-        *((__global ushort4 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_s_add_C4_D3 (__global   short *src1, int src1_step, int src1_offset,
-                                  __global   short *dst,  int dst_step,  int dst_offset,
-                                  int4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 3) + dst_offset);
-
-        short4 src_data1 = *((__global short4 *)((__global char *)src1 + src1_index));
-
-        short4 data = convert_short4_sat(ARITHM_OP(convert_int4_sat(src_data1), src2));
-
-        *((__global short4 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_s_add_C4_D4 (__global   int *src1, int src1_step, int src1_offset,
-                                  __global   int *dst,  int dst_step,  int dst_offset,
-                                  int4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 4) + dst_offset);
-
-        int4 src_data1 = *((__global int4 *)((__global char *)src1 + src1_index));
-
-        int4 data = convert_int4_sat(ARITHM_OP(convert_long4_sat(src_data1), convert_long4_sat(src2)));
-
-        *((__global int4 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_s_add_C4_D5 (__global   float *src1, int src1_step, int src1_offset,
-                                  __global   float *dst,  int dst_step,  int dst_offset,
-                                  float4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 4) + dst_offset);
-
-        float4 src_data1 = *((__global float4 *)((__global char *)src1 + src1_index));
-
-        float4 data = ARITHM_OP(src_data1, src2);
-
-        *((__global float4 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-
-#if defined (DOUBLE_SUPPORT)
-__kernel void arithm_s_add_C4_D6 (__global   double *src1, int src1_step, int src1_offset,
-                                  __global   double *dst,  int dst_step,  int dst_offset,
-                                  double4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 5) + src1_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 5) + dst_offset);
-
-        double4 src_data1 = *((__global double4 *)((__global char *)src1 + src1_index));
-
-        double4 data = ARITHM_OP(src_data1, src2);
-
-        *((__global double4 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-#endif
diff --git a/modules/ocl/src/opencl/arithm_add_scalar_mask.cl b/modules/ocl/src/opencl/arithm_add_scalar_mask.cl
index a0cb7dacb..5c3408034 100644
--- a/modules/ocl/src/opencl/arithm_add_scalar_mask.cl
+++ b/modules/ocl/src/opencl/arithm_add_scalar_mask.cl
@@ -51,561 +51,28 @@
 #endif
 #endif
 
-#ifdef ARITHM_ADD
-  #define ARITHM_OP(A,B) ((A)+(B))
-#elif defined ARITHM_SUB
-  #define ARITHM_OP(A,B) ((A)-(B))
-#endif
-/**************************************add with scalar with mask**************************************/
-__kernel void arithm_s_add_with_mask_C1_D0 (__global   uchar *src1, int src1_step, int src1_offset,
-                                            __global   uchar *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            int4 src2, int rows, int cols, int dst_step1)
-{
+///////////////////////////////////////////////////////////////////////////////////
+///////////////////////// Binary op with scalar and mask //////////////////////////
+///////////////////////////////////////////////////////////////////////////////////
 
+__kernel void arithm_binary_op_scalar_mask(__global T *src1, int src1_step, int src1_offset,
+                                     __global WT *scalar, // only scalar[0] is read
+                                     __global uchar *mask, int mask_step, int mask_offset, // one mask byte is tested per element
+                                     __global T *dst,  int dst_step,  int dst_offset,
+                                     int cols, int rows) // masked element-wise op: dst = src1 Operation scalar[0] where mask != 0
+{
     int x = get_global_id(0);
     int y = get_global_id(1);
 
     if (x < cols && y < rows)
     {
-        x = x << 2;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align (dst_offset & 3)
-        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
-        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-        int mask_index_fix = mask_index < 0 ? 0 : mask_index;
-        uchar4 src1_data = vload4(0, src1 + src1_index_fix);
-        int4 src2_data = (int4)(src2.x, src2.x, src2.x, src2.x);
-        uchar4 mask_data = vload4(0, mask + mask_index_fix);
-        if(src1_index < 0)
+        int mask_index = mad24(y, mask_step, x + mask_offset); // y * mask_step + x + mask_offset
+        if (mask[mask_index]) // dst is left untouched where the mask byte is zero
         {
-            uchar4 tmp;
-            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+            int src1_index = mad24(y, src1_step, x + src1_offset); // indexes the T pointer directly, so step/offset are
+            int dst_index = mad24(y, dst_step, dst_offset + x); // presumably in elements of T, not bytes — TODO confirm
+
+            dst[dst_index] = convertToT(convertToWT(src1[src1_index]) Operation scalar[0]); // T/WT/convertTo*/Operation: presumably build-time macros — TODO confirm
         }
-        if(mask_index < 0)
-        {
-            uchar4 tmp;
-            tmp.xyzw = (mask_index == -2) ? mask_data.zwxy:mask_data.yzwx;
-            mask_data.xyzw = (mask_index == -1) ? mask_data.wxyz:tmp.xyzw;
-        }
-
-        uchar4 data = *((__global uchar4 *)(dst + dst_index));
-        int4 tmp = ARITHM_OP(convert_int4_sat(src1_data), src2_data);
-        uchar4 tmp_data = convert_uchar4_sat(tmp);
-
-        data.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x;
-        data.y = ((mask_data.y) && (dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : data.y;
-        data.z = ((mask_data.z) && (dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : data.z;
-        data.w = ((mask_data.w) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : data.w;
-
-        *((__global uchar4 *)(dst + dst_index)) = data;
     }
 }
-__kernel void arithm_s_add_with_mask_C1_D2 (__global   ushort *src1, int src1_step, int src1_offset,
-                                            __global   ushort *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar  *mask, int mask_step, int mask_offset,
-                                            int4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 1;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align ((dst_offset / 2) & 1)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
-        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
-
-        ushort2 src1_data = vload2(0, (__global ushort *)((__global char *)src1 + src1_index));
-        int2 src2_data = (int2)(src2.x, src2.x);
-        uchar2  mask_data = vload2(0, mask + mask_index);
-
-        ushort2 data = *((__global ushort2 *)((__global uchar *)dst + dst_index));
-        int2    tmp = ARITHM_OP(convert_int2_sat(src1_data), src2_data);
-        ushort2 tmp_data = convert_ushort2_sat(tmp);
-
-        data.x = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.x : data.x;
-        data.y = ((mask_data.y) && (dst_index + 2 <  dst_end  )) ? tmp_data.y : data.y;
-
-        *((__global ushort2 *)((__global uchar *)dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_s_add_with_mask_C1_D3 (__global   short *src1, int src1_step, int src1_offset,
-                                            __global   short *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            int4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 1;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align ((dst_offset / 2) & 1)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
-        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
-
-        short2 src1_data = vload2(0, (__global short *)((__global char *)src1 + src1_index));
-        int2 src2_data = (int2)(src2.x, src2.x);
-        uchar2  mask_data = vload2(0, mask + mask_index);
-
-        short2 data = *((__global short2 *)((__global uchar *)dst + dst_index));
-        int2    tmp = ARITHM_OP(convert_int2_sat(src1_data), src2_data);
-        short2 tmp_data = convert_short2_sat(tmp);
-
-        data.x = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.x : data.x;
-        data.y = ((mask_data.y) && (dst_index + 2 <  dst_end  )) ? tmp_data.y : data.y;
-
-        *((__global short2 *)((__global uchar *)dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_s_add_with_mask_C1_D4 (__global   int   *src1, int src1_step, int src1_offset,
-                                            __global   int   *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            int4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
-        int mask_index = mad24(y, mask_step,  x       + mask_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);
-
-        uchar mask_data = *(mask + mask_index);
-
-        int src_data1 = *((__global int *)((__global char *)src1 + src1_index));
-        int src_data2 = src2.x;
-        int dst_data  = *((__global int *)((__global char *)dst  + dst_index));
-
-        int data = convert_int_sat(ARITHM_OP((long)src_data1, (long)src_data2));
-        data = mask_data ? data : dst_data;
-
-        *((__global int *)((__global char *)dst + dst_index)) = data;
-    }
-}
-
-__kernel void arithm_s_add_with_mask_C1_D5 (__global   float   *src1, int src1_step, int src1_offset,
-                                            __global   float   *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            float4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
-        int mask_index = mad24(y, mask_step,  x       + mask_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);
-
-        uchar mask_data = *(mask + mask_index);
-
-        float src_data1 = *((__global float *)((__global char *)src1 + src1_index));
-        float src_data2 = src2.x;
-        float dst_data  = *((__global float *)((__global char *)dst  + dst_index));
-
-        float data = ARITHM_OP(src_data1, src_data2);
-        data = mask_data ? data : dst_data;
-
-        *((__global float *)((__global char *)dst + dst_index)) = data;
-    }
-}
-
-
-#if defined (DOUBLE_SUPPORT)
-__kernel void arithm_s_add_with_mask_C1_D6 (__global   double   *src1, int src1_step, int src1_offset,
-                                            __global   double   *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            double4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
-        int mask_index = mad24(y, mask_step,  x       + mask_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 3) + dst_offset);
-
-        uchar mask_data = *(mask + mask_index);
-
-        double src_data1 = *((__global double *)((__global char *)src1 + src1_index));
-        double src_data2 = src2.x;
-        double dst_data  = *((__global double *)((__global char *)dst  + dst_index));
-
-        double data = ARITHM_OP(src_data1, src_data2);
-        data = mask_data ? data : dst_data;
-
-        *((__global double *)((__global char *)dst + dst_index)) = data;
-    }
-}
-#endif
-__kernel void arithm_s_add_with_mask_C2_D0 (__global   uchar *src1, int src1_step, int src1_offset,
-                                            __global   uchar *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            int4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 1;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align ((dst_offset / 2) & 1)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
-        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
-
-        uchar4 src1_data = vload4(0, src1 + src1_index);
-        int4   src2_data = (int4)(src2.x, src2.y, src2.x, src2.y);
-        uchar2 mask_data = vload2(0, mask + mask_index);
-
-        uchar4 data = *((__global uchar4 *)(dst + dst_index));
-        int4  tmp = ARITHM_OP(convert_int4_sat(src1_data), src2_data);
-        uchar4 tmp_data = convert_uchar4_sat(tmp);
-
-        data.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.xy : data.xy;
-        data.zw = ((mask_data.y) && (dst_index + 2 <  dst_end  )) ? tmp_data.zw : data.zw;
-
-        *((__global uchar4 *)(dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_s_add_with_mask_C2_D2 (__global   ushort *src1, int src1_step, int src1_offset,
-                                            __global   ushort *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            int4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
-        int mask_index = mad24(y, mask_step,  x       + mask_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);
-
-        uchar mask_data = *(mask + mask_index);
-
-        ushort2 src_data1 = *((__global ushort2 *)((__global char *)src1 + src1_index));
-        int2 src_data2 = (int2)(src2.x, src2.y);
-        ushort2 dst_data  = *((__global ushort2 *)((__global char *)dst  + dst_index));
-
-        int2    tmp = ARITHM_OP(convert_int2_sat(src_data1), src_data2);
-        ushort2 data = convert_ushort2_sat(tmp);
-        data = mask_data ? data : dst_data;
-
-        *((__global ushort2 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_s_add_with_mask_C2_D3 (__global   short *src1, int src1_step, int src1_offset,
-                                            __global   short *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            int4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
-        int mask_index = mad24(y, mask_step,  x       + mask_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);
-
-        uchar mask_data = *(mask + mask_index);
-
-        short2 src_data1 = *((__global short2 *)((__global char *)src1 + src1_index));
-        int2 src_data2 = (int2)(src2.x, src2.y);
-        short2 dst_data  = *((__global short2 *)((__global char *)dst  + dst_index));
-
-        int2    tmp = ARITHM_OP(convert_int2_sat(src_data1), src_data2);
-        short2 data = convert_short2_sat(tmp);
-        data = mask_data ? data : dst_data;
-
-        *((__global short2 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_s_add_with_mask_C2_D4 (__global   int *src1, int src1_step, int src1_offset,
-                                            __global   int *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            int4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
-        int mask_index = mad24(y, mask_step,  x       + mask_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 3) + dst_offset);
-
-        uchar mask_data = *(mask + mask_index);
-
-        int2 src_data1 = *((__global int2 *)((__global char *)src1 + src1_index));
-        int2 src_data2 = (int2)(src2.x, src2.y);
-        int2 dst_data  = *((__global int2 *)((__global char *)dst  + dst_index));
-
-        int2 data = convert_int2_sat(ARITHM_OP(convert_long2_sat(src_data1), convert_long2_sat(src_data2)));
-        data = mask_data ? data : dst_data;
-
-        *((__global int2 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_s_add_with_mask_C2_D5 (__global   float *src1, int src1_step, int src1_offset,
-                                            __global   float *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            float4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
-        int mask_index = mad24(y, mask_step,  x       + mask_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 3) + dst_offset);
-
-        uchar mask_data = *(mask + mask_index);
-
-        float2 src_data1 = *((__global float2 *)((__global char *)src1 + src1_index));
-        float2 src_data2 = (float2)(src2.x, src2.y);
-        float2 dst_data  = *((__global float2 *)((__global char *)dst  + dst_index));
-
-        float2 data = ARITHM_OP(src_data1, src_data2);
-        data = mask_data ? data : dst_data;
-
-        *((__global float2 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-
-#if defined (DOUBLE_SUPPORT)
-__kernel void arithm_s_add_with_mask_C2_D6 (__global   double *src1, int src1_step, int src1_offset,
-                                            __global   double *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            double4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
-        int mask_index = mad24(y, mask_step,  x       + mask_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 4) + dst_offset);
-
-        uchar mask_data = *(mask + mask_index);
-
-        double2 src_data1 = *((__global double2 *)((__global char *)src1 + src1_index));
-        double2 src_data2 = (double2)(src2.x, src2.y);
-        double2 dst_data  = *((__global double2 *)((__global char *)dst  + dst_index));
-
-        double2 data = ARITHM_OP(src_data1, src_data2);
-        data = mask_data ? data : dst_data;
-
-        *((__global double2 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-#endif
-
-__kernel void arithm_s_add_with_mask_C4_D0 (__global   uchar *src1, int src1_step, int src1_offset,
-                                            __global   uchar *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            int4 src2, int rows, int cols, int dst_step1)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
-        int mask_index = mad24(y, mask_step,  x       + mask_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);
-
-        uchar mask_data = *(mask + mask_index);
-
-        uchar4 src_data1 = *((__global uchar4 *)(src1 + src1_index));
-        uchar4 dst_data  = *((__global uchar4 *)(dst  + dst_index));
-
-        uchar4 data = convert_uchar4_sat(ARITHM_OP(convert_int4_sat(src_data1), src2));
-        data = mask_data ? data : dst_data;
-
-        *((__global uchar4 *)(dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_s_add_with_mask_C4_D2 (__global   ushort *src1, int src1_step, int src1_offset,
-                                            __global   ushort *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            int4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
-        int mask_index = mad24(y, mask_step,  x       + mask_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 3) + dst_offset);
-
-        uchar mask_data = *(mask + mask_index);
-
-        ushort4 src_data1 = *((__global ushort4 *)((__global char *)src1 + src1_index));
-        ushort4 dst_data  = *((__global ushort4 *)((__global char *)dst  + dst_index));
-
-        ushort4 data = convert_ushort4_sat(ARITHM_OP(convert_int4_sat(src_data1), src2));
-        data = mask_data ? data : dst_data;
-
-        *((__global ushort4 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_s_add_with_mask_C4_D3 (__global   short *src1, int src1_step, int src1_offset,
-                                            __global   short *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            int4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
-        int mask_index = mad24(y, mask_step,  x       + mask_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 3) + dst_offset);
-
-        uchar mask_data = *(mask + mask_index);
-
-        short4 src_data1 = *((__global short4 *)((__global char *)src1 + src1_index));
-        short4 dst_data  = *((__global short4 *)((__global char *)dst  + dst_index));
-
-        short4 data = convert_short4_sat(ARITHM_OP(convert_int4_sat(src_data1), src2));
-        data = mask_data ? data : dst_data;
-
-        *((__global short4 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_s_add_with_mask_C4_D4 (__global   int *src1, int src1_step, int src1_offset,
-                                            __global   int *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            int4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
-        int mask_index = mad24(y, mask_step,  x       + mask_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 4) + dst_offset);
-
-        uchar mask_data = *(mask + mask_index);
-
-        int4 src_data1 = *((__global int4 *)((__global char *)src1 + src1_index));
-        int4 dst_data  = *((__global int4 *)((__global char *)dst  + dst_index));
-
-        int4 data = convert_int4_sat(ARITHM_OP(convert_long4_sat(src_data1), convert_long4_sat(src2)));
-        data = mask_data ? data : dst_data;
-
-        *((__global int4 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_s_add_with_mask_C4_D5 (__global   float *src1, int src1_step, int src1_offset,
-                                            __global   float *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            float4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
-        int mask_index = mad24(y, mask_step,  x       + mask_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 4) + dst_offset);
-
-        uchar mask_data = *(mask + mask_index);
-
-        float4 src_data1 = *((__global float4 *)((__global char *)src1 + src1_index));
-        float4 dst_data  = *((__global float4 *)((__global char *)dst  + dst_index));
-
-        float4 data = ARITHM_OP(src_data1, src2);
-        data = mask_data ? data : dst_data;
-
-        *((__global float4 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-
-#if defined (DOUBLE_SUPPORT)
-__kernel void arithm_s_add_with_mask_C4_D6 (__global   double *src1, int src1_step, int src1_offset,
-                                            __global   double *dst,  int dst_step,  int dst_offset,
-                                            __global   uchar *mask, int mask_step, int mask_offset,
-                                            double4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 5) + src1_offset);
-        int mask_index = mad24(y, mask_step,  x       + mask_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 5) + dst_offset);
-
-        uchar mask_data = *(mask + mask_index);
-
-        double4 src_data1 = *((__global double4 *)((__global char *)src1 + src1_index));
-        double4 dst_data  = *((__global double4 *)((__global char *)dst  + dst_index));
-
-        double4 data = ARITHM_OP(src_data1, src2);
-        data = mask_data ? data : dst_data;
-
-        *((__global double4 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-#endif
diff --git a/modules/ocl/src/opencl/arithm_bitwise_binary.cl b/modules/ocl/src/opencl/arithm_bitwise_binary.cl
index 8bdd23c17..898b40a9e 100644
--- a/modules/ocl/src/opencl/arithm_bitwise_binary.cl
+++ b/modules/ocl/src/opencl/arithm_bitwise_binary.cl
@@ -43,303 +43,25 @@
 // the use of this software, even if advised of the possibility of such damage.
 //
 //M*/
-#if defined (DOUBLE_SUPPORT)
-#ifdef cl_khr_fp64
-#pragma OPENCL EXTENSION cl_khr_fp64:enable
-#elif defined (cl_amd_fp64)
-#pragma OPENCL EXTENSION cl_amd_fp64:enable
-#endif
-#endif
-
-//bitwise_binary without mask for and, or, xor operators
 
 /////////////////////////////////////////////////////////////////////////////////////////////////////
-////////////////////////////////////////////bitwise_binary///////////////////////////////////////////
+/////////////////////////////////////////// bitwise_binary //////////////////////////////////////////
 /////////////////////////////////////////////////////////////////////////////////////////////////////
 
-#ifndef OP_BINARY
-#define OP_BINARY &
-#endif
-
-__kernel void arithm_bitwise_binary_D0 (__global uchar *src1, int src1_step, int src1_offset,
-                                     __global uchar *src2, int src2_step, int src2_offset,
-                                     __global uchar *dst,  int dst_step,  int dst_offset,
-                                     int rows, int cols, int dst_step1)
+__kernel void arithm_bitwise_binary(__global uchar * src1, int src1_step, int src1_offset, // each work-item writes one uchar: dst = src1 Operation src2
+                                    __global uchar * src2, int src2_step, int src2_offset,
+                                    __global uchar * dst, int dst_step, int dst_offset,
+                                    int cols1, int rows) // upper bounds for the x / y work-item ids; cols1 is presumably the row width in bytes -- TODO confirm in host code
 {
     int x = get_global_id(0);
     int y = get_global_id(1);
 
-    if (x < cols && y < rows)
+    if (x < cols1 && y < rows) // work-items outside the cols1 x rows range do nothing
     {
-        x = x << 2;
+        int src1_index = mad24(y, src1_step, x + src1_offset); // y * step + offset + x, used directly as a uchar index
+        int src2_index = mad24(y, src2_step, x + src2_offset); // same addressing for the second operand
+        int dst_index = mad24(y, dst_step, dst_offset + x); // same addressing for the output
 
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align (dst_offset & 3)
-        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
-        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
-        uchar4 src1_data = vload4(0, src1 + src1_index_fix);
-        uchar4 src2_data = vload4(0, src2 + src2_index_fix);
-
-        if(src1_index < 0)
-        {
-            uchar4 tmp;
-            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-        }
-        if(src2_index < 0)
-        {
-            uchar4 tmp;
-            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-        }
-
-        uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
-        uchar4 tmp_data = src1_data OP_BINARY src2_data;
-
-        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
-        dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
-        dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
-        dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;
-
-        *((__global uchar4 *)(dst + dst_index)) = dst_data;
+        dst[dst_index] = src1[src1_index] Operation src2[src2_index]; // Operation is not defined in this file; presumably a build-time macro (e.g. &, |, ^) -- verify in host code
     }
 }
-
-
-__kernel void arithm_bitwise_binary_D1 (__global char *src1, int src1_step, int src1_offset,
-                                     __global char *src2, int src2_step, int src2_offset,
-                                     __global char *dst,  int dst_step,  int dst_offset,
-                                     int rows, int cols, int dst_step1)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 2;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align (dst_offset & 3)
-        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
-        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-
-        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
-        char4 src1_data = vload4(0, src1 + src1_index_fix);
-        char4 src2_data = vload4(0, src2 + src2_index_fix);
-
-        if(src1_index < 0)
-        {
-            char4 tmp;
-            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-        }
-        if(src2_index < 0)
-        {
-            char4 tmp;
-            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-        }
-        char4 dst_data = *((__global char4 *)(dst + dst_index));
-        char4 tmp_data = src1_data OP_BINARY src2_data;
-
-        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
-        dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
-        dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
-        dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;
-
-        *((__global char4 *)(dst + dst_index)) = dst_data;
-    }
-}
-
-
-__kernel void arithm_bitwise_binary_D2 (__global ushort *src1, int src1_step, int src1_offset,
-                                     __global ushort *src2, int src2_step, int src2_offset,
-                                     __global ushort *dst,  int dst_step,  int dst_offset,
-                                     int rows, int cols, int dst_step1)
-
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 2;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align ((dst_offset >> 1) & 3)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
-        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8);
-
-        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
-        ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index_fix));
-        ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index_fix));
-
-        if(src1_index < 0)
-        {
-            ushort4 tmp;
-            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-        }
-        if(src2_index < 0)
-        {
-            ushort4 tmp;
-            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-        }
-        ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index));
-        ushort4 tmp_data = src1_data OP_BINARY src2_data;
-
-        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
-        dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y;
-        dst_data.z = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.z : dst_data.z;
-        dst_data.w = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) ? tmp_data.w : dst_data.w;
-
-        *((__global ushort4 *)((__global char *)dst + dst_index)) = dst_data;
-    }
-}
-
-
-
-__kernel void arithm_bitwise_binary_D3 (__global short *src1, int src1_step, int src1_offset,
-                                     __global short *src2, int src2_step, int src2_offset,
-                                     __global short *dst,  int dst_step,  int dst_offset,
-                                     int rows, int cols, int dst_step1)
-
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 2;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align ((dst_offset >> 1) & 3)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
-        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8);
-
-        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
-        short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index_fix));
-        short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index_fix));
-
-        if(src1_index < 0)
-        {
-            short4 tmp;
-            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-        }
-        if(src2_index < 0)
-        {
-            short4 tmp;
-            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-        }
-        short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
-        short4 tmp_data = src1_data OP_BINARY src2_data;
-
-        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
-        dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y;
-        dst_data.z = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.z : dst_data.z;
-        dst_data.w = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) ? tmp_data.w : dst_data.w;
-
-        *((__global short4 *)((__global char *)dst + dst_index)) = dst_data;
-    }
-}
-
-
-
-__kernel void arithm_bitwise_binary_D4 (__global int *src1, int src1_step, int src1_offset,
-                                     __global int *src2, int src2_step, int src2_offset,
-                                     __global int *dst,  int dst_step,  int dst_offset,
-                                     int rows, int cols, int dst_step1)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
-        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);
-
-        int data1 = *((__global int *)((__global char *)src1 + src1_index));
-        int data2 = *((__global int *)((__global char *)src2 + src2_index));
-        int tmp  = data1 OP_BINARY data2;
-
-        *((__global int *)((__global char *)dst + dst_index)) = tmp;
-    }
-}
-
-__kernel void arithm_bitwise_binary_D5 (__global char *src1, int src1_step, int src1_offset,
-                                     __global char *src2, int src2_step, int src2_offset,
-                                     __global char *dst,  int dst_step,  int dst_offset,
-                                     int rows, int cols, int dst_step1)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
-        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);
-
-        char4 data1 = *((__global char4 *)((__global char *)src1 + src1_index));
-        char4 data2 = *((__global char4 *)((__global char *)src2 + src2_index));
-        char4 tmp = data1 OP_BINARY data2;
-
-        *((__global char4 *)((__global char *)dst + dst_index)) = tmp;
-    }
-}
-
-#if defined (DOUBLE_SUPPORT)
-__kernel void arithm_bitwise_binary_D6 (__global char *src1, int src1_step, int src1_offset,
-                                     __global char *src2, int src2_step, int src2_offset,
-                                     __global char *dst,  int dst_step,  int dst_offset,
-                                     int rows, int cols, int dst_step1)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
-        int src2_index = mad24(y, src2_step, (x << 3) + src2_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 3) + dst_offset);
-
-        char8 data1 = *((__global char8 *)((__global char *)src1 + src1_index));
-        char8 data2 = *((__global char8 *)((__global char *)src2 + src2_index));
-
-        *((__global char8 *)((__global char *)dst + dst_index)) = data1 OP_BINARY data2;
-    }
-}
-#endif
diff --git a/modules/ocl/src/opencl/arithm_bitwise_binary_mask.cl b/modules/ocl/src/opencl/arithm_bitwise_binary_mask.cl
index 60cd18820..622ab5b11 100644
--- a/modules/ocl/src/opencl/arithm_bitwise_binary_mask.cl
+++ b/modules/ocl/src/opencl/arithm_bitwise_binary_mask.cl
@@ -43,767 +43,31 @@
 // the use of this software, even if advised of the possibility of such damage.
 //
 //M*/
-#if defined (DOUBLE_SUPPORT)
-#ifdef cl_khr_fp64
-#pragma OPENCL EXTENSION cl_khr_fp64:enable
-#elif defined (cl_amd_fp64)
-#pragma OPENCL EXTENSION cl_amd_fp64:enable
-#endif
-#endif
-
-#ifndef OP_BINARY
-#define OP_BINARY &
-#endif
 
 //////////////////////////////////////////////////////////////////////////////////////////////////////
 ////////////////////////////////////////////bitwise_binary////////////////////////////////////////////
 //////////////////////////////////////////////////////////////////////////////////////////////////////
-/**************************************bitwise_binary with mask**************************************/
-__kernel void arithm_bitwise_binary_with_mask_C1_D0 (
-        __global uchar *src1, int src1_step, int src1_offset,
-        __global uchar *src2, int src2_step, int src2_offset,
-        __global uchar *mask, int mask_step, int mask_offset,
-        __global uchar *dst,  int dst_step,  int dst_offset,
-        int rows, int cols, int dst_step1)
-{
 
+__kernel void arithm_bitwise_binary_mask(__global uchar * src1, int src1_step, int src1_offset, // masked binary op, one uchar per work-item: dst = src1 Operation src2 where mask is non-zero
+                                    __global uchar * src2, int src2_step, int src2_offset,
+                                    __global uchar * mask, int mask_step, int mask_offset, int elemSize, // elemSize: count of consecutive x positions that share one mask entry
+                                    __global uchar * dst, int dst_step, int dst_offset,
+                                    int cols1, int rows) // exclusive bounds for x and y; NOTE(review): cols1 is presumably the row width in bytes (cols * elemSize) - confirm against the host code
+{
     int x = get_global_id(0);
     int y = get_global_id(1);
 
-    if (x < cols && y < rows)
+    if (x < cols1 && y < rows) // work-items outside the cols1 x rows range do nothing
     {
-        x = x << 2;
+        int mask_index = mad24(y, mask_step, mask_offset + (x / elemSize)); // x / elemSize selects the mask entry covering this x
 
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align (dst_offset & 3)
-        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
-        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
-        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
+        if (mask[mask_index]) // zero mask entry: dst is neither read nor written for this x
+        {
+            int src1_index = mad24(y, src1_step, x + src1_offset); // index = y * step + x + offset, applied to uchar pointers
+            int src2_index = mad24(y, src2_step, x + src2_offset);
+            int dst_index = mad24(y, dst_step, x + dst_offset);
 
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-
-        uchar4 src1_data = vload4(0, src1 + src1_index);
-        uchar4 src2_data = vload4(0, src2 + src2_index);
-        uchar4 mask_data = vload4(0, mask + mask_index);
-
-        uchar4 data = *((__global uchar4 *)(dst + dst_index));
-        uchar4 tmp_data = src1_data OP_BINARY src2_data;
-
-        data.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x;
-        data.y = ((mask_data.y) && (dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : data.y;
-        data.z = ((mask_data.z) && (dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : data.z;
-        data.w = ((mask_data.w) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : data.w;
-
-        *((__global uchar4 *)(dst + dst_index)) = data;
+            dst[dst_index] = src1[src1_index] Operation src2[src2_index]; // Operation has no definition in the lines shown here; NOTE(review): presumably supplied as a macro at program build time - confirm
+        }
     }
 }
-
-
-
-__kernel void arithm_bitwise_binary_with_mask_C1_D1 (
-        __global char *src1, int src1_step, int src1_offset,
-        __global char *src2, int src2_step, int src2_offset,
-        __global uchar *mask, int mask_step, int mask_offset,
-        __global char *dst,  int dst_step,  int dst_offset,
-        int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 2;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align (dst_offset & 3)
-        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
-        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
-        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-
-        char4 src1_data = vload4(0, src1 + src1_index);
-        char4 src2_data = vload4(0, src2 + src2_index);
-        uchar4 mask_data = vload4(0, mask + mask_index);
-
-        char4 data = *((__global char4 *)(dst + dst_index));
-        char4 tmp_data = src1_data OP_BINARY src2_data;
-
-        data.x = convert_char((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x;
-        data.y = convert_char((mask_data.y) && (dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : data.y;
-        data.z = convert_char((mask_data.z) && (dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : data.z;
-        data.w = convert_char((mask_data.w) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : data.w;
-
-        *((__global char4 *)(dst + dst_index)) = data;
-    }
-}
-
-
-
-__kernel void arithm_bitwise_binary_with_mask_C1_D2 (
-        __global ushort *src1, int src1_step, int src1_offset,
-        __global ushort *src2, int src2_step, int src2_offset,
-        __global uchar  *mask, int mask_step, int mask_offset,
-        __global ushort *dst,  int dst_step,  int dst_offset,
-        int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 1;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align ((dst_offset / 2) & 1)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
-        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
-        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
-
-        ushort2 src1_data = vload2(0, (__global ushort *)((__global char *)src1 + src1_index));
-        ushort2 src2_data = vload2(0, (__global ushort *)((__global char *)src2 + src2_index));
-        uchar2  mask_data = vload2(0, mask + mask_index);
-
-        ushort2 data = *((__global ushort2 *)((__global uchar *)dst + dst_index));
-        ushort2 tmp_data = src1_data OP_BINARY src2_data;
-
-        data.x = convert_ushort((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x;
-        data.y = convert_ushort((mask_data.y) && (dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : data.y;
-
-        *((__global ushort2 *)((__global uchar *)dst + dst_index)) = data;
-    }
-}
-
-
-
-__kernel void arithm_bitwise_binary_with_mask_C1_D3 (
-        __global short *src1, int src1_step, int src1_offset,
-        __global short *src2, int src2_step, int src2_offset,
-        __global uchar *mask, int mask_step, int mask_offset,
-        __global short *dst,  int dst_step,  int dst_offset,
-        int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 1;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align ((dst_offset / 2) & 1)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
-        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
-        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
-
-        short2 src1_data = vload2(0, (__global short *)((__global char *)src1 + src1_index));
-        short2 src2_data = vload2(0, (__global short *)((__global char *)src2 + src2_index));
-        uchar2  mask_data = vload2(0, mask + mask_index);
-
-        short2 data = *((__global short2 *)((__global uchar *)dst + dst_index));
-        short2 tmp_data = src1_data OP_BINARY src2_data;
-
-        data.x = convert_short((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x;
-        data.y = convert_short((mask_data.y) && (dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : data.y;
-
-        *((__global short2 *)((__global uchar *)dst + dst_index)) = data;
-    }
-}
-
-
-
-__kernel void arithm_bitwise_binary_with_mask_C1_D4 (
-        __global int   *src1, int src1_step, int src1_offset,
-        __global int   *src2, int src2_step, int src2_offset,
-        __global uchar *mask, int mask_step, int mask_offset,
-        __global int   *dst,  int dst_step,  int dst_offset,
-        int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
-        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset);
-        int mask_index = mad24(y, mask_step,  x       + mask_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);
-
-        uchar mask_data = *(mask + mask_index);
-
-        int src_data1 = *((__global int *)((__global char *)src1 + src1_index));
-        int src_data2 = *((__global int *)((__global char *)src2 + src2_index));
-        int dst_data  = *((__global int *)((__global char *)dst  + dst_index));
-
-        int data = src_data1 OP_BINARY src_data2;
-        data = mask_data ? data : dst_data;
-
-        *((__global int *)((__global char *)dst + dst_index)) = data;
-    }
-}
-
-
-
-__kernel void arithm_bitwise_binary_with_mask_C1_D5 (
-        __global char *src1, int src1_step, int src1_offset,
-        __global char *src2, int src2_step, int src2_offset,
-        __global uchar *mask, int mask_step, int mask_offset,
-        __global char *dst,  int dst_step,  int dst_offset,
-        int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
-        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset);
-        int mask_index = mad24(y, mask_step,  x       + mask_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);
-
-        uchar mask_data = *(mask + mask_index);
-
-        char4 src_data1 = *((__global char4 *)((__global char *)src1 + src1_index));
-        char4 src_data2 = *((__global char4 *)((__global char *)src2 + src2_index));
-        char4 dst_data  = *((__global char4 *)((__global char *)dst  + dst_index));
-
-        char4 data = src_data1 OP_BINARY src_data2;
-        data = mask_data ? data : dst_data;
-
-        *((__global char4 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-
-
-
-__kernel void arithm_bitwise_binary_with_mask_C1_D6 (
-        __global char *src1, int src1_step, int src1_offset,
-        __global char *src2, int src2_step, int src2_offset,
-        __global uchar *mask, int mask_step, int mask_offset,
-        __global char *dst,  int dst_step,  int dst_offset,
-        int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
-        int src2_index = mad24(y, src2_step, (x << 3) + src2_offset);
-        int mask_index = mad24(y, mask_step,  x       + mask_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 3) + dst_offset);
-
-        uchar mask_data = *(mask + mask_index);
-
-        char8 src_data1 = *((__global char8 *)((__global char *)src1 + src1_index));
-        char8 src_data2 = *((__global char8 *)((__global char *)src2 + src2_index));
-        char8 dst_data  = *((__global char8 *)((__global char *)dst  + dst_index));
-
-        char8 data = src_data1 OP_BINARY src_data2;
-        data = mask_data ? data : dst_data;
-
-        *((__global char8 *)((__global char *)dst + dst_index)) = data;
-    }
-
-}
-
-
-
-__kernel void arithm_bitwise_binary_with_mask_C2_D0 (
-        __global uchar *src1, int src1_step, int src1_offset,
-        __global uchar *src2, int src2_step, int src2_offset,
-        __global uchar *mask, int mask_step, int mask_offset,
-        __global uchar *dst,  int dst_step,  int dst_offset,
-        int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 1;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align ((dst_offset / 2) & 1)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
-        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
-        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
-
-        uchar4 src1_data = vload4(0, src1 + src1_index);
-        uchar4 src2_data = vload4(0, src2 + src2_index);
-        uchar2 mask_data = vload2(0, mask + mask_index);
-
-        uchar4 data = *((__global uchar4 *)(dst + dst_index));
-        uchar4 tmp_data = src1_data OP_BINARY src2_data;
-
-        data.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.xy : data.xy;
-        data.zw = ((mask_data.y) && (dst_index + 2 <  dst_end  )) ? tmp_data.zw : data.zw;
-
-        *((__global uchar4 *)(dst + dst_index)) = data;
-    }
-}
-
-
-__kernel void arithm_bitwise_binary_with_mask_C2_D1 (
-        __global char *src1, int src1_step, int src1_offset,
-        __global char *src2, int src2_step, int src2_offset,
-        __global uchar *mask, int mask_step, int mask_offset,
-        __global char *dst,  int dst_step,  int dst_offset,
-        int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 1;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align ((dst_offset / 2) & 1)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
-        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
-        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
-
-        char4 src1_data = vload4(0, src1 + src1_index);
-        char4 src2_data = vload4(0, src2 + src2_index);
-        uchar2 mask_data = vload2(0, mask + mask_index);
-
-        char4 data = *((__global char4 *)(dst + dst_index));
-        char4 tmp_data = src1_data OP_BINARY src2_data;
-
-        data.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.xy : data.xy;
-        data.zw = ((mask_data.y) && (dst_index + 2 <  dst_end  )) ? tmp_data.zw : data.zw;
-
-        *((__global char4 *)(dst + dst_index)) = data;
-    }
-}
-
-__kernel void arithm_bitwise_binary_with_mask_C2_D2 (
-        __global ushort *src1, int src1_step, int src1_offset,
-        __global ushort *src2, int src2_step, int src2_offset,
-        __global uchar  *mask, int mask_step, int mask_offset,
-        __global ushort *dst,  int dst_step,  int dst_offset,
-        int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
-        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset);
-        int mask_index = mad24(y, mask_step,  x       + mask_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);
-
-        uchar mask_data = *(mask + mask_index);
-
-        ushort2 src_data1 = *((__global ushort2 *)((__global char *)src1 + src1_index));
-        ushort2 src_data2 = *((__global ushort2 *)((__global char *)src2 + src2_index));
-        ushort2 dst_data  = *((__global ushort2 *)((__global char *)dst  + dst_index));
-
-        ushort2 data = src_data1 OP_BINARY src_data2;
-        data = mask_data ? data : dst_data;
-
-        *((__global ushort2 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_bitwise_binary_with_mask_C2_D3 (
-        __global short *src1, int src1_step, int src1_offset,
-        __global short *src2, int src2_step, int src2_offset,
-        __global uchar *mask, int mask_step, int mask_offset,
-        __global short *dst,  int dst_step,  int dst_offset,
-        int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
-        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset);
-        int mask_index = mad24(y, mask_step,  x       + mask_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);
-
-        uchar mask_data = *(mask + mask_index);
-
-        short2 src_data1 = *((__global short2 *)((__global char *)src1 + src1_index));
-        short2 src_data2 = *((__global short2 *)((__global char *)src2 + src2_index));
-        short2 dst_data  = *((__global short2 *)((__global char *)dst  + dst_index));
-
-        short2 data = src_data1 OP_BINARY src_data2;
-        data = mask_data ? data : dst_data;
-
-        *((__global short2 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_bitwise_binary_with_mask_C2_D4 (
-        __global int   *src1, int src1_step, int src1_offset,
-        __global int   *src2, int src2_step, int src2_offset,
-        __global uchar *mask, int mask_step, int mask_offset,
-        __global int    *dst,  int dst_step,  int dst_offset,
-        int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
-        int src2_index = mad24(y, src2_step, (x << 3) + src2_offset);
-        int mask_index = mad24(y, mask_step,  x       + mask_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 3) + dst_offset);
-
-        uchar mask_data = *(mask + mask_index);
-
-        int2 src_data1 = *((__global int2 *)((__global char *)src1 + src1_index));
-        int2 src_data2 = *((__global int2 *)((__global char *)src2 + src2_index));
-        int2 dst_data  = *((__global int2 *)((__global char *)dst  + dst_index));
-
-        int2 data = src_data1 OP_BINARY src_data2;
-        data = mask_data ? data : dst_data;
-
-        *((__global int2 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_bitwise_binary_with_mask_C2_D5 (
-        __global char *src1, int src1_step, int src1_offset,
-        __global char *src2, int src2_step, int src2_offset,
-        __global uchar *mask, int mask_step, int mask_offset,
-        __global char *dst,  int dst_step,  int dst_offset,
-        int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
-        int src2_index = mad24(y, src2_step, (x << 3) + src2_offset);
-        int mask_index = mad24(y, mask_step,  x       + mask_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 3) + dst_offset);
-
-        uchar mask_data = *(mask + mask_index);
-
-        char8 src_data1 = *((__global char8 *)((__global char *)src1 + src1_index));
-        char8 src_data2 = *((__global char8 *)((__global char *)src2 + src2_index));
-        char8 dst_data  = *((__global char8 *)((__global char *)dst  + dst_index));
-
-        char8 data = src_data1 OP_BINARY src_data2;
-        data = mask_data ? data : dst_data;
-
-        *((__global char8 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-
-__kernel void arithm_bitwise_binary_with_mask_C2_D6 (
-        __global char *src1, int src1_step, int src1_offset,
-        __global char *src2, int src2_step, int src2_offset,
-        __global uchar *mask, int mask_step, int mask_offset,
-        __global char *dst,  int dst_step,  int dst_offset,
-        int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
-        int src2_index = mad24(y, src2_step, (x << 4) + src2_offset);
-        int mask_index = mad24(y, mask_step,  x       + mask_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 4) + dst_offset);
-
-        uchar mask_data = *(mask + mask_index);
-
-        char16 src_data1 = *((__global char16 *)((__global char *)src1 + src1_index));
-        char16 src_data2 = *((__global char16 *)((__global char *)src2 + src2_index));
-        char16 dst_data  = *((__global char16 *)((__global char *)dst  + dst_index));
-
-        char16 data = src_data1 OP_BINARY src_data2;
-        data = mask_data ? data : dst_data;
-
-        *((__global char16 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-
-
-__kernel void arithm_bitwise_binary_with_mask_C4_D0 (
-        __global uchar *src1, int src1_step, int src1_offset,
-        __global uchar *src2, int src2_step, int src2_offset,
-        __global uchar *mask, int mask_step, int mask_offset,
-        __global uchar *dst,  int dst_step,  int dst_offset,
-        int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
-        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset);
-        int mask_index = mad24(y, mask_step,  x       + mask_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);
-
-        uchar mask_data = *(mask + mask_index);
-
-        uchar4 src_data1 = *((__global uchar4 *)(src1 + src1_index));
-        uchar4 src_data2 = *((__global uchar4 *)(src2 + src2_index));
-        uchar4 dst_data  = *((__global uchar4 *)(dst  + dst_index));
-
-        uchar4 data = src_data1 OP_BINARY src_data2;
-        data = mask_data ? data : dst_data;
-
-        *((__global uchar4 *)(dst + dst_index)) = data;
-    }
-}
-
-
-__kernel void arithm_bitwise_binary_with_mask_C4_D1 (
-        __global char *src1, int src1_step, int src1_offset,
-        __global char *src2, int src2_step, int src2_offset,
-        __global uchar *mask, int mask_step, int mask_offset,
-        __global char *dst,  int dst_step,  int dst_offset,
-        int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
-        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset);
-        int mask_index = mad24(y, mask_step,  x       + mask_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);
-
-        uchar mask_data = *(mask + mask_index);
-
-        char4 src_data1 = *((__global char4 *)(src1 + src1_index));
-        char4 src_data2 = *((__global char4 *)(src2 + src2_index));
-        char4 dst_data  = *((__global char4 *)(dst  + dst_index));
-
-        char4 data = src_data1 OP_BINARY src_data2;
-        data = mask_data ? data : dst_data;
-
-        *((__global char4 *)(dst + dst_index)) = data;
-    }
-}
-
-__kernel void arithm_bitwise_binary_with_mask_C4_D2 (
-        __global ushort *src1, int src1_step, int src1_offset,
-        __global ushort *src2, int src2_step, int src2_offset,
-        __global uchar  *mask, int mask_step, int mask_offset,
-        __global ushort *dst,  int dst_step,  int dst_offset,
-        int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
-        int src2_index = mad24(y, src2_step, (x << 3) + src2_offset);
-        int mask_index = mad24(y, mask_step,  x       + mask_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 3) + dst_offset);
-
-        uchar mask_data = *(mask + mask_index);
-
-        ushort4 src_data1 = *((__global ushort4 *)((__global char *)src1 + src1_index));
-        ushort4 src_data2 = *((__global ushort4 *)((__global char *)src2 + src2_index));
-        ushort4 dst_data  = *((__global ushort4 *)((__global char *)dst  + dst_index));
-
-        ushort4 data = src_data1 OP_BINARY src_data2;
-        data = mask_data ? data : dst_data;
-
-        *((__global ushort4 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_bitwise_binary_with_mask_C4_D3 (
-        __global short *src1, int src1_step, int src1_offset,
-        __global short *src2, int src2_step, int src2_offset,
-        __global uchar *mask, int mask_step, int mask_offset,
-        __global short *dst,  int dst_step,  int dst_offset,
-        int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
-        int src2_index = mad24(y, src2_step, (x << 3) + src2_offset);
-        int mask_index = mad24(y, mask_step,  x       + mask_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 3) + dst_offset);
-
-        uchar mask_data = *(mask + mask_index);
-
-        short4 src_data1 = *((__global short4 *)((__global char *)src1 + src1_index));
-        short4 src_data2 = *((__global short4 *)((__global char *)src2 + src2_index));
-        short4 dst_data  = *((__global short4 *)((__global char *)dst  + dst_index));
-
-        short4 data = src_data1 OP_BINARY src_data2;
-        data = mask_data ? data : dst_data;
-
-        *((__global short4 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_bitwise_binary_with_mask_C4_D4 (
-        __global int   *src1, int src1_step, int src1_offset,
-        __global int   *src2, int src2_step, int src2_offset,
-        __global uchar *mask, int mask_step, int mask_offset,
-        __global int   *dst,  int dst_step,  int dst_offset,
-        int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
-        int src2_index = mad24(y, src2_step, (x << 4) + src2_offset);
-        int mask_index = mad24(y, mask_step,  x       + mask_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 4) + dst_offset);
-
-        uchar mask_data = *(mask + mask_index);
-
-        int4 src_data1 = *((__global int4 *)((__global char *)src1 + src1_index));
-        int4 src_data2 = *((__global int4 *)((__global char *)src2 + src2_index));
-        int4 dst_data  = *((__global int4 *)((__global char *)dst  + dst_index));
-
-        int4 data = src_data1 OP_BINARY src_data2;
-        data = mask_data ? data : dst_data;
-
-        *((__global int4 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_bitwise_binary_with_mask_C4_D5 (
-        __global char *src1, int src1_step, int src1_offset,
-        __global char *src2, int src2_step, int src2_offset,
-        __global uchar *mask, int mask_step, int mask_offset,
-        __global char *dst,  int dst_step,  int dst_offset,
-        int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
-        int src2_index = mad24(y, src2_step, (x << 4) + src2_offset);
-        int mask_index = mad24(y, mask_step,  x       + mask_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 4) + dst_offset);
-
-        uchar mask_data = *(mask + mask_index);
-
-        char16 src_data1 = *((__global char16 *)((__global char *)src1 + src1_index));
-        char16 src_data2 = *((__global char16 *)((__global char *)src2 + src2_index));
-        char16 dst_data  = *((__global char16 *)((__global char *)dst  + dst_index));
-
-        char16 data = src_data1 OP_BINARY src_data2;
-        data = mask_data ? data : dst_data;
-
-        *((__global char16 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-#if defined (DOUBLE_SUPPORT)
-__kernel void arithm_bitwise_binary_with_mask_C4_D6 (
-        __global char *src1, int src1_step, int src1_offset,
-        __global char *src2, int src2_step, int src2_offset,
-        __global uchar  *mask, int mask_step, int mask_offset,
-        __global char *dst,  int dst_step,  int dst_offset,
-        int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 5) + src1_offset);
-        int src2_index = mad24(y, src2_step, (x << 5) + src2_offset);
-        int mask_index = mad24(y, mask_step,  x       + mask_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 5) + dst_offset);
-
-        uchar mask_data = *(mask + mask_index);
-
-        char8 src_data1_0 = *((__global char8 *)((__global char *)src1 + src1_index + 0));
-        char8 src_data1_1 = *((__global char8 *)((__global char *)src1 + src1_index + 8));
-        char8 src_data1_2 = *((__global char8 *)((__global char *)src1 + src1_index + 16));
-        char8 src_data1_3 = *((__global char8 *)((__global char *)src1 + src1_index + 24));
-
-        char8 src_data2_0 = *((__global char8 *)((__global char *)src2 + src2_index + 0));
-        char8 src_data2_1 = *((__global char8 *)((__global char *)src2 + src2_index + 8));
-        char8 src_data2_2 = *((__global char8 *)((__global char *)src2 + src2_index + 16));
-        char8 src_data2_3 = *((__global char8 *)((__global char *)src2 + src2_index + 24));
-
-        char8 dst_data_0  = *((__global char8 *)((__global char *)dst  + dst_index + 0));
-        char8 dst_data_1  = *((__global char8 *)((__global char *)dst  + dst_index + 8));
-        char8 dst_data_2  = *((__global char8 *)((__global char *)dst  + dst_index + 16));
-        char8 dst_data_3  = *((__global char8 *)((__global char *)dst  + dst_index + 24));
-
-        char8 data_0 = src_data1_0 OP_BINARY src_data2_0;
-        char8 data_1 = src_data1_1 OP_BINARY src_data2_1;
-        char8 data_2 = src_data1_2 OP_BINARY src_data2_2;
-        char8 data_3 = src_data1_3 OP_BINARY src_data2_3;
-
-        data_0 = mask_data ? data_0 : dst_data_0;
-        data_1 = mask_data ? data_1 : dst_data_1;
-        data_2 = mask_data ? data_2 : dst_data_2;
-        data_3 = mask_data ? data_3 : dst_data_3;
-
-        *((__global char8 *)((__global char *)dst + dst_index + 0)) = data_0;
-        *((__global char8 *)((__global char *)dst + dst_index + 8)) = data_1;
-        *((__global char8 *)((__global char *)dst + dst_index + 16)) = data_2;
-        *((__global char8 *)((__global char *)dst + dst_index + 24)) = data_3;
-    }
-}
-#endif
diff --git a/modules/ocl/src/opencl/arithm_bitwise_binary_scalar.cl b/modules/ocl/src/opencl/arithm_bitwise_binary_scalar.cl
index 5fa25004d..c17b412a6 100644
--- a/modules/ocl/src/opencl/arithm_bitwise_binary_scalar.cl
+++ b/modules/ocl/src/opencl/arithm_bitwise_binary_scalar.cl
@@ -43,596 +43,26 @@
 // the use of this software, even if advised of the possibility of such damage.
 //
 //
-#if defined (DOUBLE_SUPPORT)
-#ifdef cl_khr_fp64
-#pragma OPENCL EXTENSION cl_khr_fp64:enable
-#elif defined (cl_amd_fp64)
-#pragma OPENCL EXTENSION cl_amd_fp64:enable
-#endif
-#endif
-
-#ifndef OP_BINARY
-#define OP_BINARY &
-#endif
 
 ///////////////////////////////////////////////////////////////////////////////////////////////////////
 ////////////////////////////////////////////bitwise_binary/////////////////////////////////////////////
 ///////////////////////////////////////////////////////////////////////////////////////////////////////
-/******************************bitwise binary with scalar without mask********************************/
-__kernel void arithm_s_bitwise_binary_C1_D0 (
-        __global   uchar *src1, int src1_step, int src1_offset,
-        __global   uchar *dst,  int dst_step,  int dst_offset,
-        uchar4 src2, int rows, int cols, int dst_step1)
+// Each work-item writes one uchar: dst = src1 Operation src2, with src2 read cyclically (index x % elemSize). NOTE(review): Operation is presumably a macro supplied when the program is built - confirm in the host code.
+__kernel void arithm_bitwise_binary_scalar(
+        __global uchar *src1, int src1_step, int src1_offset, // first operand; step and offset are added as uchar indices
+        __global uchar *src2, int elemSize,                   // second operand, indexed modulo elemSize (presumably the scalar's bytes - confirm against caller)
+        __global uchar *dst, int dst_step, int dst_offset,    // output, addressed the same way as src1
+        int cols1, int rows)                                  // exclusive upper bounds for the x and y work-item ids
 {
     int x = get_global_id(0);
     int y = get_global_id(1);
 
-    if (x < cols && y < rows)
+    if (x < cols1 && y < rows) // work-items outside the cols1 x rows range do nothing
     {
-        x = x << 2;
+        int src1_index = mad24(y, src1_step, src1_offset + x); // y * src1_step + src1_offset + x
+        int src2_index = x % elemSize;                         // wraps, so src2 repeats every elemSize positions along x
+        int dst_index  = mad24(y, dst_step, dst_offset + x);   // y * dst_step + dst_offset + x
 
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align (dst_offset & 3)
-        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-
-        uchar4 src1_data = vload4(0, src1 + src1_index);
-        uchar4 src2_data = (uchar4)(src2.x, src2.x, src2.x, src2.x);
-
-        uchar4 data = *((__global uchar4 *)(dst + dst_index));
-        uchar4 tmp_data = src1_data OP_BINARY src2_data;
-
-        data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x;
-        data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : data.y;
-        data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : data.z;
-        data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : data.w;
-
-        *((__global uchar4 *)(dst + dst_index)) = data;
+        dst[dst_index] = src1[src1_index] Operation src2[src2_index]; // single uchar store; no read-modify-write of dst
     }
 }
-
-
-__kernel void arithm_s_bitwise_binary_C1_D1 (
-        __global   char *src1, int src1_step, int src1_offset,
-        __global   char *dst,  int dst_step,  int dst_offset,
-        char4 src2, int rows, int cols, int dst_step1)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 2;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align (dst_offset & 3)
-        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-
-        char4 src1_data = vload4(0, src1 + src1_index);
-        char4 src2_data = (char4)(src2.x, src2.x, src2.x, src2.x);
-
-        char4 data = *((__global char4 *)(dst + dst_index));
-        char4 tmp_data = src1_data OP_BINARY src2_data;
-
-        data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x;
-        data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : data.y;
-        data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : data.z;
-        data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : data.w;
-
-        *((__global char4 *)(dst + dst_index)) = data;
-    }
-}
-
-__kernel void arithm_s_bitwise_binary_C1_D2 (
-        __global   ushort *src1, int src1_step, int src1_offset,
-        __global   ushort *dst,  int dst_step,  int dst_offset,
-        ushort4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 1;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align ((dst_offset >> 1) & 1)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
-
-        ushort2 src1_data = vload2(0, (__global ushort *)((__global char *)src1 + src1_index));
-        ushort2 src2_data = (ushort2)(src2.x, src2.x);
-
-        ushort2 data = *((__global ushort2 *)((__global uchar *)dst + dst_index));
-        ushort2 tmp_data = src1_data OP_BINARY src2_data;
-
-        data.x = (dst_index + 0 >= dst_start) ? tmp_data.x : data.x;
-        data.y = (dst_index + 2 <  dst_end  ) ? tmp_data.y : data.y;
-
-        *((__global ushort2 *)((__global uchar *)dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_s_bitwise_binary_C1_D3 (
-        __global   short *src1, int src1_step, int src1_offset,
-        __global   short *dst,  int dst_step,  int dst_offset,
-        short4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 1;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align ((dst_offset >> 1) & 1)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
-
-        short2 src1_data = vload2(0, (__global short *)((__global char *)src1 + src1_index));
-        short2 src2_data = (short2)(src2.x, src2.x);
-        short2 data = *((__global short2 *)((__global uchar *)dst + dst_index));
-
-        short2 tmp_data = src1_data OP_BINARY src2_data;
-
-        data.x = (dst_index + 0 >= dst_start) ? tmp_data.x : data.x;
-        data.y = (dst_index + 2 <  dst_end  ) ? tmp_data.y : data.y;
-
-        *((__global short2 *)((__global uchar *)dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_s_bitwise_binary_C1_D4 (
-        __global   int *src1, int src1_step, int src1_offset,
-        __global   int *dst,  int dst_step,  int dst_offset,
-        int4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);
-
-        int src_data1 = *((__global int *)((__global char *)src1 + src1_index));
-        int src_data2 = src2.x;
-
-        int data = src_data1 OP_BINARY src_data2;
-
-        *((__global int *)((__global char *)dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_s_bitwise_binary_C1_D5 (
-        __global   char *src1, int src1_step, int src1_offset,
-        __global   char *dst,  int dst_step,  int dst_offset,
-        char16 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);
-
-        char4 src1_data = *((__global char4 *)((__global char *)src1 + src1_index));
-        char4 src2_data = (char4)(src2.s0, src2.s1, src2.s2, src2.s3);
-
-        char4 data  = *((__global char4 *)((__global char *)dst  + dst_index));
-        char4 tmp_data = src1_data OP_BINARY src2_data;
-
-        data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x;
-        data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : data.y;
-        data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : data.z;
-        data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : data.w;
-
-        *((__global char4 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-#if defined (DOUBLE_SUPPORT)
-__kernel void arithm_s_bitwise_binary_C1_D6 (
-        __global short *src1, int src1_step, int src1_offset,
-        __global short *dst,  int dst_step,  int dst_offset,
-        short16 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 3) + dst_offset);
-
-        short4 src1_data = *((__global short4 *)((__global char *)src1 + src1_index));
-        short4 src2_data = (short4)(src2.s0, src2.s1, src2.s2, src2.s3);
-
-        short4 tmp_data = src1_data OP_BINARY src2_data;
-
-        *((__global short4 *)((__global char *)dst + dst_index)) = tmp_data;
-    }
-}
-#endif
-__kernel void arithm_s_bitwise_binary_C2_D0 (
-        __global   uchar *src1, int src1_step, int src1_offset,
-        __global   uchar *dst,  int dst_step,  int dst_offset,
-        uchar4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 1;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align ((dst_offset >> 1) & 1)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
-
-        uchar4 src1_data = vload4(0, src1 + src1_index);
-        uchar4 src2_data = (uchar4)(src2.x, src2.y, src2.x, src2.y);
-
-        uchar4 data = *((__global uchar4 *)(dst + dst_index));
-        uchar4 tmp_data = src1_data OP_BINARY src2_data;
-
-
-        data.xy = (dst_index + 0 >= dst_start) ? tmp_data.xy : data.xy;
-        data.zw = (dst_index + 2 <  dst_end  ) ? tmp_data.zw : data.zw;
-
-        *((__global uchar4 *)(dst + dst_index)) = data;
-    }
-}
-
-
-__kernel void arithm_s_bitwise_binary_C2_D1 (
-        __global   char *src1, int src1_step, int src1_offset,
-        __global   char *dst,  int dst_step,  int dst_offset,
-        char4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 1;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align ((dst_offset >> 1) & 1)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
-
-        char4 src1_data = vload4(0, src1 + src1_index);
-        char4 src2_data = (char4)(src2.x, src2.y, src2.x, src2.y);
-
-        char4 data = *((__global char4 *)(dst + dst_index));
-        char4 tmp_data = src1_data OP_BINARY src2_data;
-
-        data.xy = (dst_index + 0 >= dst_start) ? tmp_data.xy : data.xy;
-        data.zw = (dst_index + 2 <  dst_end  ) ? tmp_data.zw : data.zw;
-
-        *((__global char4 *)(dst + dst_index)) = data;
-    }
-}
-
-__kernel void arithm_s_bitwise_binary_C2_D2 (
-        __global   ushort *src1, int src1_step, int src1_offset,
-        __global   ushort *dst,  int dst_step,  int dst_offset,
-        ushort4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);
-
-        ushort2 src_data1 = *((__global ushort2 *)((__global char *)src1 + src1_index));
-        ushort2 src_data2 = (ushort2)(src2.x, src2.y);
-
-        ushort2 data = src_data1 OP_BINARY src_data2;
-
-        *((__global ushort2 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_s_bitwise_binary_C2_D3 (
-        __global   short *src1, int src1_step, int src1_offset,
-        __global   short *dst,  int dst_step,  int dst_offset,
-        short4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);
-
-        short2 src_data1 = *((__global short2 *)((__global char *)src1 + src1_index));
-        short2 src_data2 = (short2)(src2.x, src2.y);
-
-        short2 data = src_data1 OP_BINARY src_data2;
-
-        *((__global short2 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_s_bitwise_binary_C2_D4 (
-        __global   int *src1, int src1_step, int src1_offset,
-        __global   int *dst,  int dst_step,  int dst_offset,
-        int4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 3) + dst_offset);
-
-        int2 src_data1 = *((__global int2 *)((__global char *)src1 + src1_index));
-        int2 src_data2 = (int2)(src2.x, src2.y);
-
-        int2 data = src_data1 OP_BINARY src_data2;
-        *((__global int2 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_s_bitwise_binary_C2_D5 (
-        __global   char *src1, int src1_step, int src1_offset,
-        __global   char *dst,  int dst_step,  int dst_offset,
-        char16 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 3) + dst_offset);
-
-        char8 src1_data = *((__global char8 *)((__global char *)src1 + src1_index));
-        char8 src2_data = (char8)(src2.s0, src2.s1, src2.s2, src2.s3, src2.s4, src2.s5, src2.s6, src2.s7);
-
-        char8 tmp_data = src1_data OP_BINARY src2_data;
-
-        *((__global char8 *)((__global char *)dst + dst_index)) = tmp_data;
-    }
-}
-#if defined (DOUBLE_SUPPORT)
-__kernel void arithm_s_bitwise_binary_C2_D6 (
-        __global short *src1, int src1_step, int src1_offset,
-        __global short *dst,  int dst_step,  int dst_offset,
-        short16 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 4) + dst_offset);
-
-        short8 src1_data = *((__global short8 *)((__global char *)src1 + src1_index));
-        short8 src2_data = (short8)(src2.s0, src2.s1, src2.s2, src2.s3, src2.s4, src2.s5, src2.s6, src2.s7);
-
-        short8 tmp_data = src1_data OP_BINARY src2_data;
-
-        *((__global short8 *)((__global char *)dst + dst_index)) = tmp_data;
-    }
-}
-#endif
-
-__kernel void arithm_s_bitwise_binary_C4_D0 (
-        __global   uchar *src1, int src1_step, int src1_offset,
-        __global   uchar *dst,  int dst_step,  int dst_offset,
-        uchar4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);
-
-        uchar4 src_data1 = *((__global uchar4 *)(src1 + src1_index));
-
-        uchar4 data = src_data1 OP_BINARY src2;
-
-        *((__global uchar4 *)(dst + dst_index)) = data;
-    }
-}
-
-
-__kernel void arithm_s_bitwise_binary_C4_D1 (
-        __global   char *src1, int src1_step, int src1_offset,
-        __global   char *dst,  int dst_step,  int dst_offset,
-        char4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);
-
-        char4 src_data1 = *((__global char4 *)(src1 + src1_index));
-
-        char4 data = src_data1 OP_BINARY src2;
-
-        *((__global char4 *)(dst + dst_index)) = data;
-    }
-}
-
-__kernel void arithm_s_bitwise_binary_C4_D2 (
-        __global   ushort *src1, int src1_step, int src1_offset,
-        __global   ushort *dst,  int dst_step,  int dst_offset,
-        ushort4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 3) + dst_offset);
-
-        ushort4 src_data1 = *((__global ushort4 *)((__global char *)src1 + src1_index));
-
-        ushort4 data = src_data1 OP_BINARY src2;
-
-        *((__global ushort4 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_s_bitwise_binary_C4_D3 (
-        __global   short *src1, int src1_step, int src1_offset,
-        __global   short *dst,  int dst_step,  int dst_offset,
-        short4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 3) + dst_offset);
-
-        short4 src_data1 = *((__global short4 *)((__global char *)src1 + src1_index));
-
-        short4 data = src_data1 OP_BINARY src2;
-
-        *((__global short4 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_s_bitwise_binary_C4_D4 (
-        __global   int *src1, int src1_step, int src1_offset,
-        __global   int *dst,  int dst_step,  int dst_offset,
-        int4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 4) + dst_offset);
-
-        int4 src_data1 = *((__global int4 *)((__global char *)src1 + src1_index));
-
-        int4 data = src_data1 OP_BINARY src2;
-
-        *((__global int4 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_s_bitwise_binary_C4_D5 (
-        __global   char *src1, int src1_step, int src1_offset,
-        __global   char *dst,  int dst_step,  int dst_offset,
-        char16 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 4) + dst_offset);
-
-        char16 src1_data = *((__global char16 *)((__global char *)src1 + src1_index));
-        char16 src2_data = (char16)(src2.s0, src2.s1, src2.s2, src2.s3, src2.s4, src2.s5, src2.s6, src2.s7,
-                                    src2.s8, src2.s9, src2.sa, src2.sb, src2.sc, src2.sd, src2.se, src2.sf);
-
-        char16 tmp_data = src1_data OP_BINARY src2_data;
-
-        *((__global char16 *)((__global char *)dst + dst_index)) = tmp_data;
-    }
-}
-#if defined (DOUBLE_SUPPORT)
-__kernel void arithm_s_bitwise_binary_C4_D6 (
-        __global short *src1, int src1_step, int src1_offset,
-        __global short *dst,  int dst_step,  int dst_offset,
-        short16 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 5) + src1_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 5) + dst_offset);
-
-        short4 src1_data_0 = *((__global short4 *)((__global char *)src1 + src1_index + 0));
-        short4 src1_data_1 = *((__global short4 *)((__global char *)src1 + src1_index + 8));
-        short4 src1_data_2 = *((__global short4 *)((__global char *)src1 + src1_index + 16));
-        short4 src1_data_3 = *((__global short4 *)((__global char *)src1 + src1_index + 24));
-
-        short4 src2_data_0 = (short4)(src2.s0, src2.s1, src2.s2, src2.s3);
-        short4 src2_data_1 = (short4)(src2.s4, src2.s5, src2.s6, src2.s7);
-        short4 src2_data_2 = (short4)(src2.s8, src2.s9, src2.sa, src2.sb);
-        short4 src2_data_3 = (short4)(src2.sc, src2.sd, src2.se, src2.sf);
-
-        short4 tmp_data_0 = src1_data_0 OP_BINARY src2_data_0;
-        short4 tmp_data_1 = src1_data_1 OP_BINARY src2_data_1;
-        short4 tmp_data_2 = src1_data_2 OP_BINARY src2_data_2;
-        short4 tmp_data_3 = src1_data_3 OP_BINARY src2_data_3;
-
-        *((__global short4 *)((__global char *)dst + dst_index + 0 ))= tmp_data_0;
-        *((__global short4 *)((__global char *)dst + dst_index + 8 ))= tmp_data_1;
-        *((__global short4 *)((__global char *)dst + dst_index + 16))= tmp_data_2;
-        *((__global short4 *)((__global char *)dst + dst_index + 24))= tmp_data_3;
-
-    }
-}
-#endif
diff --git a/modules/ocl/src/opencl/arithm_bitwise_binary_scalar_mask.cl b/modules/ocl/src/opencl/arithm_bitwise_binary_scalar_mask.cl
index 9af6589ad..bae1699a3 100644
--- a/modules/ocl/src/opencl/arithm_bitwise_binary_scalar_mask.cl
+++ b/modules/ocl/src/opencl/arithm_bitwise_binary_scalar_mask.cl
@@ -42,6 +42,7 @@
 // the use of this software, even if advised of the possibility of such damage.
 //
 //M*/
+
 #if defined (DOUBLE_SUPPORT)
 #ifdef cl_khr_fp64
 #pragma OPENCL EXTENSION cl_khr_fp64:enable
@@ -50,698 +51,29 @@
 #endif
 #endif
 
-#ifndef OP_BINARY
-#define OP_BINARY &
-#endif
-
 //////////////////////////////////////////////////////////////////////////////////////////////////////
 ////////////////////////////////////////////bitwise_binary////////////////////////////////////////////
 //////////////////////////////////////////////////////////////////////////////////////////////////////
-/**************************************bitwise_binary with scalar with mask**************************************/
-__kernel void arithm_s_bitwise_binary_with_mask_C1_D0 (
-        __global   uchar *src1, int src1_step, int src1_offset,
-        __global   uchar *dst,  int dst_step,  int dst_offset,
-        __global   uchar *mask, int mask_step, int mask_offset,
-        uchar4 src2, int rows, int cols, int dst_step1)
-{
 
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 2;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align (dst_offset & 3)
-        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
-        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-
-        uchar4 src1_data = vload4(0, src1 + src1_index);
-        uchar4 src2_data = (uchar4)(src2.x, src2.x, src2.x, src2.x);
-        uchar4 mask_data = vload4(0, mask + mask_index);
-
-        uchar4 data = *((__global uchar4 *)(dst + dst_index));
-        uchar4 tmp_data = src1_data OP_BINARY src2_data;
-
-        data.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x;
-        data.y = ((mask_data.y) && (dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : data.y;
-        data.z = ((mask_data.z) && (dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : data.z;
-        data.w = ((mask_data.w) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : data.w;
-
-        *((__global uchar4 *)(dst + dst_index)) = data;
-    }
-}
-
-
-__kernel void arithm_s_bitwise_binary_with_mask_C1_D1 (
-        __global   char *src1, int src1_step, int src1_offset,
-        __global   char *dst,  int dst_step,  int dst_offset,
-        __global   uchar *mask, int mask_step, int mask_offset,
-        char4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 2;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align (dst_offset & 3)
-        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
-        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-
-        char4 src1_data = vload4(0, src1 + src1_index);
-        char4 src2_data = (char4)(src2.x, src2.x, src2.x, src2.x);
-        uchar4 mask_data = vload4(0, mask + mask_index);
-
-        char4 data = *((__global char4 *)(dst + dst_index));
-        char4 tmp_data = src1_data OP_BINARY src2_data;
-
-        data.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x;
-        data.y = ((mask_data.y) && (dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : data.y;
-        data.z = ((mask_data.z) && (dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : data.z;
-        data.w = ((mask_data.w) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : data.w;
-
-        *((__global char4 *)(dst + dst_index)) = data;
-    }
-}
-
-__kernel void arithm_s_bitwise_binary_with_mask_C1_D2 (
-        __global   ushort *src1, int src1_step, int src1_offset,
-        __global   ushort *dst,  int dst_step,  int dst_offset,
-        __global   uchar  *mask, int mask_step, int mask_offset,
-        ushort4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 1;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align ((dst_offset / 2) & 1)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
-        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
-
-        ushort2 src1_data = vload2(0, (__global ushort *)((__global char *)src1 + src1_index));
-        ushort2 src2_data = (ushort2)(src2.x, src2.x);
-        uchar2  mask_data = vload2(0, mask + mask_index);
-
-        ushort2 data = *((__global ushort2 *)((__global uchar *)dst + dst_index));
-        ushort2 tmp_data = src1_data OP_BINARY src2_data;
-
-        data.x = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.x : data.x;
-        data.y = ((mask_data.y) && (dst_index + 2 <  dst_end  )) ? tmp_data.y : data.y;
-
-        *((__global ushort2 *)((__global uchar *)dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_s_bitwise_binary_with_mask_C1_D3 (
-        __global   short *src1, int src1_step, int src1_offset,
-        __global   short *dst,  int dst_step,  int dst_offset,
-        __global   uchar *mask, int mask_step, int mask_offset,
-        short4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 1;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align ((dst_offset / 2) & 1)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
-        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
-
-        short2 src1_data = vload2(0, (__global short *)((__global char *)src1 + src1_index));
-        short2 src2_data = (short2)(src2.x, src2.x);
-        uchar2  mask_data = vload2(0, mask + mask_index);
-
-        short2 data = *((__global short2 *)((__global uchar *)dst + dst_index));
-        short2 tmp_data = src1_data OP_BINARY src2_data;
-
-        data.x = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.x : data.x;
-        data.y = ((mask_data.y) && (dst_index + 2 <  dst_end  )) ? tmp_data.y : data.y;
-
-        *((__global short2 *)((__global uchar *)dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_s_bitwise_binary_with_mask_C1_D4 (
-        __global   int   *src1, int src1_step, int src1_offset,
-        __global   int   *dst,  int dst_step,  int dst_offset,
-        __global   uchar *mask, int mask_step, int mask_offset,
-        int4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
-        int mask_index = mad24(y, mask_step,  x       + mask_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);
-
-        uchar mask_data = *(mask + mask_index);
-
-        int src_data1 = *((__global int *)((__global char *)src1 + src1_index));
-        int src_data2 = src2.x;
-        int dst_data  = *((__global int *)((__global char *)dst  + dst_index));
-
-        int data = src_data1 OP_BINARY src_data2;
-        data = mask_data ? data : dst_data;
-
-        *((__global int *)((__global char *)dst + dst_index)) = data;
-    }
-}
-
-__kernel void arithm_s_bitwise_binary_with_mask_C1_D5 (
-        __global char *src1, int src1_step, int src1_offset,
-        __global char *dst,  int dst_step,  int dst_offset,
-        __global   uchar *mask, int mask_step, int mask_offset,
-        char16 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
-        int mask_index = mad24(y, mask_step,  x       + mask_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);
-
-        uchar mask_data = *(mask + mask_index);
-
-        char4 src1_data = *((__global char4 *)((__global char *)src1 + src1_index));
-        char4 src2_data = (char4)(src2.s0, src2.s1, src2.s2, src2.s3);
-        char4 dst_data  = *((__global char4 *)((__global char *)dst  + dst_index));
-
-        char4 data = src1_data OP_BINARY src2_data;
-        data = mask_data ? data : dst_data;
-
-        *((__global char4 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-
-#if defined (DOUBLE_SUPPORT)
-__kernel void arithm_s_bitwise_binary_with_mask_C1_D6 (
-        __global short *src1, int src1_step, int src1_offset,
-        __global short *dst,  int dst_step,  int dst_offset,
-        __global   uchar *mask, int mask_step, int mask_offset,
-        short16 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
-        int mask_index = mad24(y, mask_step,  x       + mask_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 3) + dst_offset);
-
-        uchar mask_data = *(mask + mask_index);
-
-        short4 src1_data = *((__global short4 *)((__global char *)src1 + src1_index));
-        short4 src2_data = (short4)(src2.s0, src2.s1, src2.s2, src2.s3);
-        short4 dst_data  = *((__global short4 *)((__global char *)dst  + dst_index));
-
-        short4 data = src1_data OP_BINARY src2_data;
-        data = mask_data ? data : dst_data;
-
-        *((__global short4 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-#endif
-__kernel void arithm_s_bitwise_binary_with_mask_C2_D0 (
-        __global   uchar *src1, int src1_step, int src1_offset,
-        __global   uchar *dst,  int dst_step,  int dst_offset,
-        __global   uchar *mask, int mask_step, int mask_offset,
-        uchar4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 1;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align ((dst_offset / 2) & 1)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
-        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
-
-        uchar4 src1_data = vload4(0, src1 + src1_index);
-        uchar4 src2_data = (uchar4)(src2.x, src2.y, src2.x, src2.y);
-        uchar2 mask_data = vload2(0, mask + mask_index);
-
-        uchar4 data = *((__global uchar4 *)(dst + dst_index));
-        uchar4 tmp_data = src1_data OP_BINARY src2_data;
-
-        data.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.xy : data.xy;
-        data.zw = ((mask_data.y) && (dst_index + 2 <  dst_end  )) ? tmp_data.zw : data.zw;
-
-        *((__global uchar4 *)(dst + dst_index)) = data;
-    }
-}
-
-
-__kernel void arithm_s_bitwise_binary_with_mask_C2_D1 (
-        __global   char *src1, int src1_step, int src1_offset,
-        __global   char *dst,  int dst_step,  int dst_offset,
-        __global   uchar *mask, int mask_step, int mask_offset,
-        char4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 1;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align ((dst_offset / 2) & 1)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
-        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
-
-        char4 src1_data = vload4(0, src1 + src1_index);
-        char4 src2_data = (char4)(src2.x, src2.y, src2.x, src2.y);
-        uchar2 mask_data = vload2(0, mask + mask_index);
-
-        char4 data = *((__global char4 *)(dst + dst_index));
-        char4 tmp_data = src1_data OP_BINARY src2_data;
-
-        data.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.xy : data.xy;
-        data.zw = ((mask_data.y) && (dst_index + 2 <  dst_end  )) ? tmp_data.zw : data.zw;
-
-        *((__global char4 *)(dst + dst_index)) = data;
-    }
-}
-
-__kernel void arithm_s_bitwise_binary_with_mask_C2_D2 (
-        __global   ushort *src1, int src1_step, int src1_offset,
-        __global   ushort *dst,  int dst_step,  int dst_offset,
-        __global   uchar *mask, int mask_step, int mask_offset,
-        ushort4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
-        int mask_index = mad24(y, mask_step,  x       + mask_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);
-
-        uchar mask_data = *(mask + mask_index);
-
-        ushort2 src_data1 = *((__global ushort2 *)((__global char *)src1 + src1_index));
-        ushort2 src_data2 = (ushort2)(src2.x, src2.y);
-        ushort2 dst_data  = *((__global ushort2 *)((__global char *)dst  + dst_index));
-
-        ushort2 data = src_data1 OP_BINARY src_data2;
-        data = mask_data ? data : dst_data;
-
-        *((__global ushort2 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_s_bitwise_binary_with_mask_C2_D3 (
-        __global   short *src1, int src1_step, int src1_offset,
-        __global   short *dst,  int dst_step,  int dst_offset,
-        __global   uchar *mask, int mask_step, int mask_offset,
-        short4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
-        int mask_index = mad24(y, mask_step,  x       + mask_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);
-
-        uchar mask_data = *(mask + mask_index);
-
-        short2 src_data1 = *((__global short2 *)((__global char *)src1 + src1_index));
-        short2 src_data2 = (short2)(src2.x, src2.y);
-        short2 dst_data  = *((__global short2 *)((__global char *)dst  + dst_index));
-
-        short2 data = src_data1 OP_BINARY src_data2;
-        data = mask_data ? data : dst_data;
-
-        *((__global short2 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_s_bitwise_binary_with_mask_C2_D4 (
-        __global   int *src1, int src1_step, int src1_offset,
-        __global   int *dst,  int dst_step,  int dst_offset,
-        __global   uchar *mask, int mask_step, int mask_offset,
-        int4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
-        int mask_index = mad24(y, mask_step,  x       + mask_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 3) + dst_offset);
-
-        uchar mask_data = *(mask + mask_index);
-
-        int2 src_data1 = *((__global int2 *)((__global char *)src1 + src1_index));
-        int2 src_data2 = (int2)(src2.x, src2.y);
-        int2 dst_data  = *((__global int2 *)((__global char *)dst  + dst_index));
-
-        int2 data = src_data1 OP_BINARY src_data2;
-        data = mask_data ? data : dst_data;
-
-        *((__global int2 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_s_bitwise_binary_with_mask_C2_D5 (
-        __global   char *src1, int src1_step, int src1_offset,
-        __global  char *dst,  int dst_step,  int dst_offset,
-        __global   uchar *mask, int mask_step, int mask_offset,
-        char16 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
-        int mask_index = mad24(y, mask_step,  x       + mask_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 3) + dst_offset);
-
-        uchar mask_data = *(mask + mask_index);
-
-        char8 src1_data = *((__global char8 *)((__global char *)src1 + src1_index));
-        char8 src2_data = (char8)(src2.s0, src2.s1, src2.s2, src2.s3, src2.s4, src2.s5, src2.s6, src2.s7);
-        char8 dst_data = *((__global char8 *)((__global char *)dst  + dst_index));
-
-        char8 data = src1_data OP_BINARY src2_data;
-
-        data = mask_data ? data : dst_data;
-
-        *((__global char8 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-#if defined (DOUBLE_SUPPORT)
-__kernel void arithm_s_bitwise_binary_with_mask_C2_D6 (
-        __global short *src1, int src1_step, int src1_offset,
-        __global short *dst,  int dst_step,  int dst_offset,
+__kernel void arithm_bitwise_binary_scalar_mask(__global uchar *src1, int src1_step, int src1_offset, // masked per-byte "src1 Operation scalar" into dst
+        __global uchar *src2, int elemSize, // src2 is subscripted with x % elemSize below, i.e. read as elemSize scalar bytes
         __global uchar *mask, int mask_step, int mask_offset,
-        short16 src2, int rows, int cols, int dst_step1)
+        __global uchar *dst,  int dst_step,  int dst_offset,
+        int cols, int rows) // cols bounds the byte index x; presumably the row width in bytes - confirm with host code
 {
-
     int x = get_global_id(0);
     int y = get_global_id(1);
 
     if (x < cols && y < rows)
     {
-        int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
-        int mask_index = mad24(y, mask_step,  x       + mask_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 4) + dst_offset);
+        int mask_index = mad24(y, mask_step, (x / elemSize) + mask_offset); // one mask byte governs elemSize consecutive bytes of src1/dst
+        if (mask[mask_index]) // dst is not written at all where the mask byte is zero
+        {
+            int src1_index = mad24(y, src1_step, x + src1_offset); // x is used directly as an offset into the uchar buffers
+            int src2_index = x % elemSize; // position of this byte within the scalar
+            int dst_index = mad24(y, dst_step, x + dst_offset);
 
-        uchar mask_data = *(mask + mask_index);
-
-        short8 src1_data = *((__global short8 *)((__global char *)src1 + src1_index));
-        short8 src2_data = (short8)(src2.s0, src2.s1, src2.s2, src2.s3, src2.s4, src2.s5, src2.s6, src2.s7);
-        short8 dst_data = *((__global short8 *)((__global char *)dst  + dst_index));
-
-        short8 data = src1_data OP_BINARY src2_data;
-        data = mask_data ? data : dst_data;
-
-        *((__global short8 *)((__global char *)dst + dst_index)) = data;
+            dst[dst_index] = src1[src1_index] Operation src2[src2_index]; // Operation is not defined here; presumably a macro passed as a build option - confirm
+        }
     }
 }
-#endif
-__kernel void arithm_s_bitwise_binary_with_mask_C4_D0 (
-        __global   uchar *src1, int src1_step, int src1_offset,
-        __global   uchar *dst,  int dst_step,  int dst_offset,
-        __global   uchar *mask, int mask_step, int mask_offset,
-        uchar4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
-        int mask_index = mad24(y, mask_step,  x       + mask_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);
-
-        uchar mask_data = *(mask + mask_index);
-
-        uchar4 src_data1 = *((__global uchar4 *)(src1 + src1_index));
-        uchar4 dst_data  = *((__global uchar4 *)(dst  + dst_index));
-
-        uchar4 data = src_data1 OP_BINARY src2;
-        data = mask_data ? data : dst_data;
-
-        *((__global uchar4 *)(dst + dst_index)) = data;
-    }
-}
-
-
-__kernel void arithm_s_bitwise_binary_with_mask_C4_D1 (
-        __global   char *src1, int src1_step, int src1_offset,
-        __global   char *dst,  int dst_step,  int dst_offset,
-        __global   uchar *mask, int mask_step, int mask_offset,
-        char4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
-        int mask_index = mad24(y, mask_step,  x       + mask_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);
-
-        uchar mask_data = *(mask + mask_index);
-
-        char4 src_data1 = *((__global char4 *)(src1 + src1_index));
-        char4 dst_data  = *((__global char4 *)(dst  + dst_index));
-
-        char4 data = src_data1 OP_BINARY src2;
-        data = mask_data ? data : dst_data;
-
-        *((__global char4 *)(dst + dst_index)) = data;
-    }
-}
-
-__kernel void arithm_s_bitwise_binary_with_mask_C4_D2 (
-        __global   ushort *src1, int src1_step, int src1_offset,
-        __global   ushort *dst,  int dst_step,  int dst_offset,
-        __global   uchar *mask, int mask_step, int mask_offset,
-        ushort4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
-        int mask_index = mad24(y, mask_step,  x       + mask_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 3) + dst_offset);
-
-        uchar mask_data = *(mask + mask_index);
-
-        ushort4 src_data1 = *((__global ushort4 *)((__global char *)src1 + src1_index));
-        ushort4 dst_data  = *((__global ushort4 *)((__global char *)dst  + dst_index));
-
-        ushort4 data = src_data1 OP_BINARY src2;
-        data = mask_data ? data : dst_data;
-
-        *((__global ushort4 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_s_bitwise_binary_with_mask_C4_D3 (
-        __global   short *src1, int src1_step, int src1_offset,
-        __global   short *dst,  int dst_step,  int dst_offset,
-        __global   uchar *mask, int mask_step, int mask_offset,
-        short4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
-        int mask_index = mad24(y, mask_step,  x       + mask_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 3) + dst_offset);
-
-        uchar mask_data = *(mask + mask_index);
-
-        short4 src_data1 = *((__global short4 *)((__global char *)src1 + src1_index));
-        short4 dst_data  = *((__global short4 *)((__global char *)dst  + dst_index));
-
-        short4 data = src_data1 OP_BINARY src2;
-        data = mask_data ? data : dst_data;
-
-        *((__global short4 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_s_bitwise_binary_with_mask_C4_D4 (
-        __global   int *src1, int src1_step, int src1_offset,
-        __global   int *dst,  int dst_step,  int dst_offset,
-        __global   uchar *mask, int mask_step, int mask_offset,
-        int4 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
-        int mask_index = mad24(y, mask_step,  x       + mask_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 4) + dst_offset);
-
-        uchar mask_data = *(mask + mask_index);
-
-        int4 src_data1 = *((__global int4 *)((__global char *)src1 + src1_index));
-        int4 dst_data  = *((__global int4 *)((__global char *)dst  + dst_index));
-
-        int4 data = src_data1 OP_BINARY src2;
-        data = mask_data ? data : dst_data;
-
-        *((__global int4 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-__kernel void arithm_s_bitwise_binary_with_mask_C4_D5 (
-        __global   char *src1, int src1_step, int src1_offset,
-        __global   char *dst,  int dst_step,  int dst_offset,
-        __global   uchar *mask, int mask_step, int mask_offset,
-        char16 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
-        int mask_index = mad24(y, mask_step,  x       + mask_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 4) + dst_offset);
-
-        uchar mask_data = *(mask + mask_index);
-
-        char16 src1_data = *((__global char16 *)((__global char *)src1 + src1_index));
-        char16 src2_data = (char16)(src2.s0, src2.s1, src2.s2, src2.s3, src2.s4, src2.s5, src2.s6, src2.s7,
-                                    src2.s8, src2.s9, src2.sa, src2.sb, src2.sc, src2.sd, src2.se, src2.sf);
-        char16 dst_data  = *((__global char16 *)((__global char *)dst  + dst_index));
-
-        char16 data = src1_data OP_BINARY src2_data;
-        data = mask_data ? data : dst_data;
-
-        *((__global char16 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-#if defined (DOUBLE_SUPPORT)
-__kernel void arithm_s_bitwise_binary_with_mask_C4_D6 (
-        __global short *src1, int src1_step, int src1_offset,
-        __global short *dst,  int dst_step,  int dst_offset,
-        __global uchar *mask, int mask_step, int mask_offset,
-        short16 src2, int rows, int cols, int dst_step1)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 5) + src1_offset);
-        int mask_index = mad24(y, mask_step,  x       + mask_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 5) + dst_offset);
-
-        uchar mask_data = *(mask + mask_index);
-
-        short4 src1_data_0 = *((__global short4 *)((__global char *)src1 + src1_index + 0));
-        short4 src1_data_1 = *((__global short4 *)((__global char *)src1 + src1_index + 8));
-        short4 src1_data_2 = *((__global short4 *)((__global char *)src1 + src1_index + 16));
-        short4 src1_data_3 = *((__global short4 *)((__global char *)src1 + src1_index + 24));
-
-        short4 src2_data_0 = (short4)(src2.s0, src2.s1, src2.s2, src2.s3);
-        short4 src2_data_1 = (short4)(src2.s4, src2.s5, src2.s6, src2.s7);
-        short4 src2_data_2 = (short4)(src2.s8, src2.s9, src2.sa, src2.sb);
-        short4 src2_data_3 = (short4)(src2.sc, src2.sd, src2.se, src2.sf);
-
-        short4 dst_data_0  = *((__global short4 *)((__global char *)dst  + dst_index + 0));
-        short4 dst_data_1  = *((__global short4 *)((__global char *)dst  + dst_index + 8));
-        short4 dst_data_2  = *((__global short4 *)((__global char *)dst  + dst_index + 16));
-        short4 dst_data_3  = *((__global short4 *)((__global char *)dst  + dst_index + 24));
-
-        short4 data_0 = src1_data_0 OP_BINARY src2_data_0;
-        short4 data_1 = src1_data_1 OP_BINARY src2_data_1;
-        short4 data_2 = src1_data_2 OP_BINARY src2_data_2;
-        short4 data_3 = src1_data_3 OP_BINARY src2_data_3;
-
-        data_0 = mask_data ? data_0 : dst_data_0;
-        data_1 = mask_data ? data_1 : dst_data_1;
-        data_2 = mask_data ? data_2 : dst_data_2;
-        data_3 = mask_data ? data_3 : dst_data_3;
-
-        *((__global short4 *)((__global char *)dst + dst_index + 0)) = data_0;
-        *((__global short4 *)((__global char *)dst + dst_index + 8)) = data_1;
-        *((__global short4 *)((__global char *)dst + dst_index + 16)) = data_2;
-        *((__global short4 *)((__global char *)dst + dst_index + 24)) = data_3;
-    }
-}
-#endif
diff --git a/modules/ocl/src/opencl/arithm_compare.cl b/modules/ocl/src/opencl/arithm_compare.cl
new file mode 100644
index 000000000..d0842db18
--- /dev/null
+++ b/modules/ocl/src/opencl/arithm_compare.cl
@@ -0,0 +1,74 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Jia Haipeng, jiahaipeng95@gmail.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#if defined (DOUBLE_SUPPORT)
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64:enable
+#elif defined (cl_amd_fp64)
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#endif
+#endif
+
+//////////////////////////////////////////////////////////////////////////////////////////////////////
+///////////////////////////////////////////////compare////////////////////////////////////////////////
+///////////////////////////////////////////////////////////////////////////////////////////////////////
+
+__kernel void arithm_compare(__global T * src1, int src1_step1, int src1_offset1, // T is not defined in this file; presumably supplied as a build option - confirm
+                              __global T * src2, int src2_step1, int src2_offset1,
+                              __global uchar * dst, int dst_step1, int dst_offset1,
+                              int cols1, int rows)
+{
+    int x = get_global_id(0);
+    int y = get_global_id(1);
+
+    if (x < cols1 && y < rows) // work-items outside the cols1 x rows range do nothing
+    {
+        int src1_index = mad24(y, src1_step1, x + src1_offset1); // subscripts a T*, so step and offset count elements of T, not bytes
+        int src2_index = mad24(y, src2_step1, x + src2_offset1);
+        int dst_index = mad24(y, dst_step1, x + dst_offset1); // subscripts the uchar result buffer
+
+        dst[dst_index] = convert_uchar(src1[src1_index] Operation src2[src2_index] ? 255 : 0); // 255 where the test holds, else 0; Operation is presumably a comparison-operator macro - confirm
+    }
+}
diff --git a/modules/ocl/src/opencl/arithm_compare_eq.cl b/modules/ocl/src/opencl/arithm_compare_eq.cl
deleted file mode 100644
index 16a56acef..000000000
--- a/modules/ocl/src/opencl/arithm_compare_eq.cl
+++ /dev/null
@@ -1,1016 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Jiang Liyuan, jlyuan001.good@163.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other oclMaterials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-#if defined (DOUBLE_SUPPORT)
-#ifdef cl_khr_fp64
-#pragma OPENCL EXTENSION cl_khr_fp64:enable
-#elif defined (cl_amd_fp64)
-#pragma OPENCL EXTENSION cl_amd_fp64:enable
-#endif
-#endif
-
-//////////////////////////////////////////////////////////////////////////////////////////////////////
-////////////////////////////////////////////Compare EQ////////////////////////////////////////////////////
-///////////////////////////////////////////////////////////////////////////////////////////////////////
-
-__kernel void arithm_compare_eq_D0 (__global uchar *src1, int src1_step, int src1_offset,
-                                    __global uchar *src2, int src2_step, int src2_offset,
-                                    __global uchar *dst,  int dst_step,  int dst_offset,
-                                    int rows, int cols, int dst_step1)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 2;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align (dst_offset & 3)
-        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
-        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
-        uchar4 src1_data = vload4(0, src1 + src1_index_fix);
-        uchar4 src2_data = vload4(0, src2 + src2_index_fix);
-        if(src1_index < 0)
-        {
-            uchar4 tmp;
-            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-        }
-        if(src2_index < 0)
-        {
-            uchar4 tmp;
-            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-        }
-
-
-
-        uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
-        uchar4 tmp_data = convert_uchar4((src1_data == src2_data));
-
-        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
-        dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
-        dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
-        dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;
-
-        *((__global uchar4 *)(dst + dst_index)) = dst_data;
-    }
-}
-
-
-__kernel void arithm_compare_ne_D2 (__global ushort *src1, int src1_step, int src1_offset,
-                                    __global ushort *src2, int src2_step, int src2_offset,
-                                    __global uchar *dst,  int dst_step,  int dst_offset,
-                                    int rows, int cols, int dst_step1)
-
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 2;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align ((dst_offset >> 1)& 3)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
-        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-
-        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
-        ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index));
-        ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));
-        if(src1_index < 0)
-        {
-            ushort4 tmp;
-            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-        }
-        if(src2_index < 0)
-        {
-            ushort4 tmp;
-            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-        }
-
-        uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
-        uchar4 tmp_data = convert_uchar4((src1_data == src2_data));
-
-        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
-        dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
-        dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
-        dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;
-
-        *((__global uchar4 *)(dst + dst_index)) = dst_data;
-    }
-}
-
-
-__kernel void arithm_compare_eq_D3 (__global short *src1, int src1_step, int src1_offset,
-                                    __global short *src2, int src2_step, int src2_offset,
-                                    __global uchar *dst,  int dst_step,  int dst_offset,
-                                    int rows, int cols, int dst_step1)
-
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 2;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align ((dst_offset >> 1) & 3)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
-        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
-        short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index));
-        short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));
-        if(src1_index < 0)
-        {
-            short4 tmp;
-            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-        }
-        if(src2_index < 0)
-        {
-            short4 tmp;
-            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-        }
-
-
-
-
-        uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
-        uchar4 tmp_data = convert_uchar4((src1_data == src2_data));
-
-        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
-        dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
-        dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
-        dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;
-
-        *((__global uchar4 *)(dst + dst_index)) = dst_data;
-    }
-}
-
-
-
-__kernel void arithm_compare_eq_D4 (__global int *src1, int src1_step, int src1_offset,
-                                    __global int *src2, int src2_step, int src2_offset,
-                                    __global uchar *dst,  int dst_step,  int dst_offset,
-                                    int rows, int cols, int dst_step1)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 2;
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align ((dst_offset >> 2) & 3)
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
-        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
-
-        int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index));
-        int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index));
-        if(src1_index < 0)
-        {
-            int4 tmp;
-            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-        }
-        if(src2_index < 0)
-        {
-            int4 tmp;
-            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-        }
-
-        uchar4 dst_data  = *((__global uchar4 *)(dst  + dst_index));
-        uchar4 tmp_data = convert_uchar4((src1_data == src2_data));
-
-        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
-        dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
-        dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
-        dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;
-
-        *((__global uchar4 *)(dst + dst_index)) = dst_data;
-    }
-}
-
-__kernel void arithm_compare_eq_D5 (__global float *src1, int src1_step, int src1_offset,
-                                    __global float *src2, int src2_step, int src2_offset,
-                                    __global uchar *dst,  int dst_step,  int dst_offset,
-                                    int rows, int cols, int dst_step1)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 2;
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align ((dst_offset >> 2) & 3)
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
-        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
-        float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index_fix));
-        float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix));
-        if(src2_index < 0)
-        {
-            float4 tmp;
-            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-        }
-
-
-        uchar4 dst_data  = *((__global uchar4 *)(dst  + dst_index));
-        uchar4 tmp_data = convert_uchar4((src1_data == src2_data));
-
-        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
-        dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
-        dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
-        dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;
-
-        *((__global uchar4 *)(dst + dst_index)) = dst_data;
-    }
-}
-
-#if defined (DOUBLE_SUPPORT)
-__kernel void arithm_compare_eq_D6 (__global double *src1, int src1_step, int src1_offset,
-                                    __global double *src2, int src2_step, int src2_offset,
-                                    __global uchar *dst,  int dst_step,  int dst_offset,
-                                    int rows, int cols, int dst_step1)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 2;
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align ((dst_offset >> 3) & 3)
-        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
-        int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
-        double4 src1_data = vload4(0, (__global double *)((__global char *)src1 + src1_index_fix));
-        double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index_fix));
-        if(src1_index < 0)
-        {
-            double4 tmp;
-            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-        }
-        if(src2_index < 0)
-        {
-            double4 tmp;
-            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-        }
-
-
-        uchar4 dst_data  = *((__global uchar4 *)(dst  + dst_index));
-        uchar4 tmp_data = convert_uchar4((src1_data == src2_data));
-
-        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
-        dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
-        dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
-        dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;
-
-        *((__global uchar4 *)(dst + dst_index)) = dst_data;
-    }
-}
-#endif
-
-/***********************************Compare GT**************************/
-__kernel void arithm_compare_gt_D0 (__global uchar *src1, int src1_step, int src1_offset,
-                                    __global uchar *src2, int src2_step, int src2_offset,
-                                    __global uchar *dst,  int dst_step,  int dst_offset,
-                                    int rows, int cols, int dst_step1)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 2;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align (dst_offset & 3)
-        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
-        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
-        uchar4 src1_data = vload4(0, src1 + src1_index_fix);
-        uchar4 src2_data = vload4(0, src2 + src2_index_fix);
-        if(src1_index < 0)
-        {
-            uchar4 tmp;
-            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-        }
-        if(src2_index < 0)
-        {
-            uchar4 tmp;
-            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-        }
-
-
-
-        uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
-        uchar4 tmp_data = convert_uchar4((src1_data > src2_data));
-
-        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
-        dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
-        dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
-        dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;
-
-        *((__global uchar4 *)(dst + dst_index)) = dst_data;
-    }
-}
-
-__kernel void arithm_compare_gt_D2 (__global ushort *src1, int src1_step, int src1_offset,
-                                    __global ushort *src2, int src2_step, int src2_offset,
-                                    __global uchar *dst,  int dst_step,  int dst_offset,
-                                    int rows, int cols, int dst_step1)
-
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 2;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align ((dst_offset >> 1) & 3)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
-        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
-        ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index));
-        ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));
-        if(src1_index < 0)
-        {
-            ushort4 tmp;
-            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-        }
-        if(src2_index < 0)
-        {
-            ushort4 tmp;
-            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-        }
-
-
-
-        uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
-        uchar4 tmp_data = convert_uchar4((src1_data > src2_data));
-
-        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
-        dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
-        dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
-        dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;
-
-        *((__global uchar4 *)(dst + dst_index)) = dst_data;
-    }
-}
-
-
-
-__kernel void arithm_compare_gt_D3 (__global short *src1, int src1_step, int src1_offset,
-                                    __global short *src2, int src2_step, int src2_offset,
-                                    __global uchar *dst,  int dst_step,  int dst_offset,
-                                    int rows, int cols, int dst_step1)
-
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 2;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align ((dst_offset >> 1) & 3)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
-        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
-        short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index));
-        short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));
-        if(src1_index < 0)
-        {
-            short4 tmp;
-            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-        }
-        if(src2_index < 0)
-        {
-            short4 tmp;
-            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-        }
-
-
-
-        uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
-        uchar4 tmp_data = convert_uchar4((src1_data > src2_data));
-
-        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
-        dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
-        dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
-        dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;
-
-        *((__global uchar4 *)(dst + dst_index)) = dst_data;
-    }
-}
-
-__kernel void arithm_compare_gt_D4 (__global int *src1, int src1_step, int src1_offset,
-                                    __global int *src2, int src2_step, int src2_offset,
-                                    __global uchar *dst,  int dst_step,  int dst_offset,
-                                    int rows, int cols, int dst_step1)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 2;
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align ((dst_offset >> 2) & 3)
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
-        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
-
-        int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index));
-        int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index));
-        if(src1_index < 0)
-        {
-            int4 tmp;
-            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-        }
-        if(src2_index < 0)
-        {
-            int4 tmp;
-            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-        }
-
-
-        uchar4 dst_data  = *((__global uchar4 *)(dst  + dst_index));
-        uchar4 tmp_data = convert_uchar4((src1_data > src2_data));
-
-        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
-        dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
-        dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
-        dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;
-
-        *((__global uchar4 *)(dst + dst_index)) = dst_data;
-    }
-}
-
-__kernel void arithm_compare_gt_D5 (__global float *src1, int src1_step, int src1_offset,
-                                    __global float *src2, int src2_step, int src2_offset,
-                                    __global uchar *dst,  int dst_step,  int dst_offset,
-                                    int rows, int cols, int dst_step1)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 2;
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align ((dst_offset >> 2) & 3)
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
-        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
-        float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index_fix));
-        float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix));
-        if(src1_index < 0)
-        {
-            float4 tmp;
-            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-        }
-        if(src2_index < 0)
-        {
-            float4 tmp;
-            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-        }
-
-
-        uchar4 dst_data  = *((__global uchar4 *)(dst  + dst_index));
-        uchar4 tmp_data = convert_uchar4((src1_data > src2_data));
-
-        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
-        dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
-        dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
-        dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;
-
-        *((__global uchar4 *)(dst + dst_index)) = dst_data;
-    }
-}
-
-#if defined (DOUBLE_SUPPORT)
-__kernel void arithm_compare_gt_D6 (__global double *src1, int src1_step, int src1_offset,
-                                    __global double *src2, int src2_step, int src2_offset,
-                                    __global uchar *dst,  int dst_step,  int dst_offset,
-                                    int rows, int cols, int dst_step1)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 2;
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align ((dst_offset >> 3) & 3)
-        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
-        int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
-        double4 src1_data = vload4(0, (__global double *)((__global char *)src1 + src1_index_fix));
-        double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index_fix));
-        if(src1_index < 0)
-        {
-            double4 tmp;
-            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-        }
-        if(src2_index < 0)
-        {
-            double4 tmp;
-            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-        }
-
-
-        uchar4 dst_data  = *((__global uchar4 *)(dst  + dst_index));
-        uchar4 tmp_data = convert_uchar4((src1_data > src2_data));
-
-        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
-        dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
-        dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
-        dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;
-
-        *((__global uchar4 *)(dst + dst_index)) = dst_data;
-    }
-}
-#endif
-
-/***********************************Compare GE**************************/
-__kernel void arithm_compare_ge_D0 (__global uchar *src1, int src1_step, int src1_offset,
-                                    __global uchar *src2, int src2_step, int src2_offset,
-                                    __global uchar *dst,  int dst_step,  int dst_offset,
-                                    int rows, int cols, int dst_step1)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 2;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align (dst_offset & 3)
-        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
-        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-
-        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
-        uchar4 src1_data = vload4(0, src1 + src1_index_fix);
-        uchar4 src2_data = vload4(0, src2 + src2_index_fix);
-        if(src1_index < 0)
-        {
-            uchar4 tmp;
-            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-        }
-        if(src2_index < 0)
-        {
-            uchar4 tmp;
-            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-        }
-
-
-
-        uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
-        uchar4 tmp_data = convert_uchar4((src1_data >= src2_data));
-
-        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
-        dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
-        dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
-        dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;
-
-        *((__global uchar4 *)(dst + dst_index)) = dst_data;
-    }
-}
-
-
-
-__kernel void arithm_compare_ge_D2 (__global ushort *src1, int src1_step, int src1_offset,
-                                    __global ushort *src2, int src2_step, int src2_offset,
-                                    __global uchar *dst,  int dst_step,  int dst_offset,
-                                    int rows, int cols, int dst_step1)
-
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 2;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align ((dst_offset >> 1) & 3)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
-        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-
-        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
-        ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index));
-        ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));
-        if(src1_index < 0)
-        {
-            ushort4 tmp;
-            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-        }
-        if(src2_index < 0)
-        {
-            ushort4 tmp;
-            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-        }
-
-
-
-
-        uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
-        uchar4 tmp_data = convert_uchar4((src1_data >= src2_data));
-
-        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
-        dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
-        dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
-        dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;
-
-        *((__global uchar4 *)(dst + dst_index)) = dst_data;
-    }
-}
-
-
-
-__kernel void arithm_compare_ge_D3 (__global short *src1, int src1_step, int src1_offset,
-                                    __global short *src2, int src2_step, int src2_offset,
-                                    __global uchar *dst,  int dst_step,  int dst_offset,
-                                    int rows, int cols, int dst_step1)
-
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 2;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align ((dst_offset >> 1)& 3)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
-        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-
-        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
-        short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index));
-        short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));
-        if(src1_index < 0)
-        {
-            short4 tmp;
-            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-        }
-        if(src2_index < 0)
-        {
-            short4 tmp;
-            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-        }
-
-
-
-        uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
-        uchar4 tmp_data = convert_uchar4((src1_data >= src2_data));
-
-        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
-        dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
-        dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
-        dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;
-
-        *((__global uchar4 *)(dst + dst_index)) = dst_data;
-    }
-}
-
-__kernel void arithm_compare_ge_D4 (__global int *src1, int src1_step, int src1_offset,
-                                    __global int *src2, int src2_step, int src2_offset,
-                                    __global uchar *dst,  int dst_step,  int dst_offset,
-                                    int rows, int cols, int dst_step1)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 2;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align ((dst_offset >> 2)& 3)
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
-        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-
-        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
-
-        int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index));
-        int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index));
-        if(src1_index < 0)
-        {
-            int4 tmp;
-            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-        }
-        if(src2_index < 0)
-        {
-            int4 tmp;
-            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-        }
-        uchar4 dst_data  = *((__global uchar4 *)(dst  + dst_index));
-        uchar4 tmp_data = convert_uchar4((src1_data >= src2_data));
-
-        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
-        dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
-        dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
-        dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;
-
-        *((__global uchar4 *)(dst + dst_index)) = dst_data;
-    }
-}
-
-__kernel void arithm_compare_ge_D5 (__global float *src1, int src1_step, int src1_offset,
-                                    __global float *src2, int src2_step, int src2_offset,
-                                    __global uchar *dst,  int dst_step,  int dst_offset,
-                                    int rows, int cols, int dst_step1)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 2;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align ((dst_offset >> 2)& 3)
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
-        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-
-        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
-        float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index_fix));
-        float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix));
-        if(src1_index < 0)
-        {
-
-            float4 tmp;
-            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-        }
-        if(src2_index < 0)
-        {
-            float4 tmp;
-            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-        }
-
-        uchar4 dst_data  = *((__global uchar4 *)(dst  + dst_index));
-        uchar4 tmp_data = convert_uchar4((src1_data >= src2_data));
-
-        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
-        dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
-        dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
-        dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;
-
-        *((__global uchar4 *)(dst + dst_index)) = dst_data;
-    }
-}
-
-#if defined (DOUBLE_SUPPORT)
-__kernel void arithm_compare_ge_D6 (__global double *src1, int src1_step, int src1_offset,
-                                    __global double *src2, int src2_step, int src2_offset,
-                                    __global uchar *dst,  int dst_step,  int dst_offset,
-                                    int rows, int cols, int dst_step1)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 2;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align ((dst_offset >> 3)& 3)
-        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
-        int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
-        double4 src1_data = vload4(0, (__global double *)((__global char *)src1 + src1_index_fix));
-        double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index_fix));
-        if(src1_index < 0)
-        {
-            double4 tmp;
-            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-        }
-        if(src2_index < 0)
-        {
-            double4 tmp;
-            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-        }
-        uchar4 dst_data  = *((__global uchar4 *)(dst  + dst_index));
-        uchar4 tmp_data = convert_uchar4((src1_data >= src2_data));
-
-        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
-        dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
-        dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
-        dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;
-
-        *((__global uchar4 *)(dst + dst_index)) = dst_data;
-    }
-}
-#endif
diff --git a/modules/ocl/src/opencl/arithm_compare_ne.cl b/modules/ocl/src/opencl/arithm_compare_ne.cl
deleted file mode 100644
index fb5859d3b..000000000
--- a/modules/ocl/src/opencl/arithm_compare_ne.cl
+++ /dev/null
@@ -1,1013 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Jiang Liyuan, jlyuan001.good@163.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other oclMaterials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-#if defined (DOUBLE_SUPPORT)
-#ifdef cl_khr_fp64
-#pragma OPENCL EXTENSION cl_khr_fp64:enable
-#elif defined (cl_amd_fp64)
-#pragma OPENCL EXTENSION cl_amd_fp64:enable
-#endif
-#endif
-/***********************************Compare NE*******************************/
-__kernel void arithm_compare_ne_D0 (__global uchar *src1, int src1_step, int src1_offset,
-                                    __global uchar *src2, int src2_step, int src2_offset,
-                                    __global uchar *dst,  int dst_step,  int dst_offset,
-                                    int rows, int cols, int dst_step1)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 2;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align (dst_offset & 3)
-        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
-        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
-        uchar4 src1_data = vload4(0, src1 + src1_index_fix);
-        uchar4 src2_data = vload4(0, src2 + src2_index_fix);
-        if(src1_index < 0)
-        {
-            uchar4 tmp;
-            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-        }
-        if(src2_index < 0)
-        {
-            uchar4 tmp;
-            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-        }
-
-
-        uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
-        uchar4 tmp_data = convert_uchar4((src1_data != src2_data));
-
-        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
-        dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
-        dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
-        dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;
-
-        *((__global uchar4 *)(dst + dst_index)) = dst_data;
-    }
-}
-
-
-
-__kernel void arithm_compare_ne_D2 (__global ushort *src1, int src1_step, int src1_offset,
-                                    __global ushort *src2, int src2_step, int src2_offset,
-                                    __global uchar *dst,  int dst_step,  int dst_offset,
-                                    int rows, int cols, int dst_step1)
-
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 2;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align ((dst_offset >> 1)& 3)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
-        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-
-        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
-        ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index));
-        ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));
-        if(src1_index < 0)
-        {
-            ushort4 tmp;
-            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-        }
-        if(src2_index < 0)
-        {
-            ushort4 tmp;
-            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-        }
-
-        uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
-        uchar4 tmp_data = convert_uchar4((src1_data != src2_data));
-
-        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
-        dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
-        dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
-        dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;
-
-        *((__global uchar4 *)(dst + dst_index)) = dst_data;
-    }
-}
-
-
-
-__kernel void arithm_compare_ne_D3 (__global short *src1, int src1_step, int src1_offset,
-                                    __global short *src2, int src2_step, int src2_offset,
-                                    __global uchar *dst,  int dst_step,  int dst_offset,
-                                    int rows, int cols, int dst_step1)
-
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 2;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align ((dst_offset >> 1)& 3)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
-        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
-        short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index));
-        short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));
-        if(src1_index < 0)
-        {
-            short4 tmp;
-            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-        }
-        if(src2_index < 0)
-        {
-            short4 tmp;
-            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-        }
-
-
-        uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
-        uchar4 tmp_data = convert_uchar4((src1_data != src2_data));
-
-        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
-        dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
-        dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
-        dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;
-
-        *((__global uchar4 *)(dst + dst_index)) = dst_data;
-    }
-}
-
-__kernel void arithm_compare_ne_D4 (__global int *src1, int src1_step, int src1_offset,
-                                    __global int *src2, int src2_step, int src2_offset,
-                                    __global uchar *dst,  int dst_step,  int dst_offset,
-                                    int rows, int cols, int dst_step1)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 2;
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align ((dst_offset >> 2)& 3)
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
-        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-
-        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
-
-        int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index));
-        int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index));
-        if(src1_index < 0)
-        {
-            int4 tmp;
-            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-        }
-        if(src2_index < 0)
-        {
-            int4 tmp;
-            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-        }
-
-        uchar4 dst_data  = *((__global uchar4 *)(dst  + dst_index));
-        uchar4 tmp_data = convert_uchar4((src1_data != src2_data));
-
-        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
-        dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
-        dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
-        dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;
-
-        *((__global uchar4 *)(dst + dst_index)) = dst_data;
-    }
-}
-
-__kernel void arithm_compare_ne_D5 (__global float *src1, int src1_step, int src1_offset,
-                                    __global float *src2, int src2_step, int src2_offset,
-                                    __global uchar *dst,  int dst_step,  int dst_offset,
-                                    int rows, int cols, int dst_step1)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 2;
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align ((dst_offset >> 2) & 3)
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
-        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
-        float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index_fix));
-        float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix));
-        if(src1_index < 0)
-        {
-            float4 tmp;
-            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-        }
-        if(src2_index < 0)
-        {
-            float4 tmp;
-            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-        }
-
-        uchar4 dst_data  = *((__global uchar4 *)(dst  + dst_index));
-        uchar4 tmp_data = convert_uchar4((src1_data != src2_data));
-
-        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
-        dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
-        dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
-        dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;
-
-        *((__global uchar4 *)(dst + dst_index)) = dst_data;
-    }
-}
-
-#if defined (DOUBLE_SUPPORT)
-__kernel void arithm_compare_ne_D6 (__global double *src1, int src1_step, int src1_offset,
-                                    __global double *src2, int src2_step, int src2_offset,
-                                    __global uchar *dst,  int dst_step,  int dst_offset,
-                                    int rows, int cols, int dst_step1)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 2;
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align ((dst_offset >> 3) & 3)
-        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
-        int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
-        double4 src1_data = vload4(0, (__global double *)((__global char *)src1 + src1_index_fix));
-        double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index_fix));
-        if(src1_index < 0)
-        {
-            double4 tmp;
-            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-        }
-        if(src2_index < 0)
-        {
-            double4 tmp;
-            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-        }
-
-
-        uchar4 dst_data  = *((__global uchar4 *)(dst  + dst_index));
-        uchar4 tmp_data = convert_uchar4((src1_data != src2_data));
-
-        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
-        dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
-        dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
-        dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;
-
-        *((__global uchar4 *)(dst + dst_index)) = dst_data;
-    }
-}
-#endif
-
-
-/***********************************Compare LT*******************************/
-__kernel void arithm_compare_lt_D0 (__global uchar *src1, int src1_step, int src1_offset,
-                                    __global uchar *src2, int src2_step, int src2_offset,
-                                    __global  uchar *dst,  int dst_step,  int dst_offset,
-                                    int rows, int cols, int dst_step1)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 2;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align (dst_offset & 3)
-        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
-        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
-        uchar4 src1_data = vload4(0, src1 + src1_index_fix);
-        uchar4 src2_data = vload4(0, src2 + src2_index_fix);
-        if(src1_index < 0)
-        {
-            uchar4 tmp;
-            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-        }
-        if(src2_index < 0)
-        {
-            uchar4 tmp;
-            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-        }
-
-
-        uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
-        uchar4 tmp_data = convert_uchar4((src1_data < src2_data));
-
-        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
-        dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
-        dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
-        dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;
-
-        *((__global uchar4 *)(dst + dst_index)) = dst_data;
-    }
-}
-
-
-
-__kernel void arithm_compare_lt_D2 (__global ushort *src1, int src1_step, int src1_offset,
-                                    __global ushort *src2, int src2_step, int src2_offset,
-                                    __global uchar *dst,  int dst_step,  int dst_offset,
-                                    int rows, int cols, int dst_step1)
-
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 2;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align ((dst_offset >> 1) & 3)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
-        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-
-        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
-        ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index));
-        ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));
-        if(src1_index < 0)
-        {
-            ushort4 tmp;
-            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-        }
-        if(src2_index < 0)
-        {
-            ushort4 tmp;
-            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-        }
-
-
-        uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
-        uchar4 tmp_data = convert_uchar4((src1_data < src2_data));
-
-        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
-        dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
-        dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
-        dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;
-
-        *((__global uchar4 *)(dst + dst_index)) = dst_data;
-    }
-}
-
-
-
-__kernel void arithm_compare_lt_D3 (__global short *src1, int src1_step, int src1_offset,
-                                    __global short *src2, int src2_step, int src2_offset,
-                                    __global uchar *dst,  int dst_step,  int dst_offset,
-                                    int rows, int cols, int dst_step1)
-
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 2;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align ((dst_offset >> 1) & 3)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
-        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
-        short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index));
-        short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));
-        if(src1_index < 0)
-        {
-            short4 tmp;
-            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-        }
-        if(src2_index < 0)
-        {
-            short4 tmp;
-            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-        }
-
-
-
-        uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
-        uchar4 tmp_data = convert_uchar4((src1_data < src2_data));
-
-        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
-        dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
-        dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
-        dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;
-
-        *((__global uchar4 *)(dst + dst_index)) = dst_data;
-    }
-}
-
-__kernel void arithm_compare_lt_D4 (__global int *src1, int src1_step, int src1_offset,
-                                    __global int *src2, int src2_step, int src2_offset,
-                                    __global uchar *dst,  int dst_step,  int dst_offset,
-                                    int rows, int cols, int dst_step1)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 2;
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align ((dst_offset >> 2) & 3)
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
-        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-
-        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
-
-        int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index));
-        int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index));
-        if(src1_index < 0)
-        {
-            int4 tmp;
-            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-        }
-        if(src2_index < 0)
-        {
-            int4 tmp;
-            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-        }
-
-
-
-
-        uchar4 dst_data  = *((__global uchar4 *)(dst  + dst_index));
-        uchar4 tmp_data = convert_uchar4((src1_data < src2_data));
-
-        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
-        dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
-        dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
-        dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;
-
-        *((__global uchar4 *)(dst + dst_index)) = dst_data;
-    }
-}
-
-__kernel void arithm_compare_lt_D5 (__global float *src1, int src1_step, int src1_offset,
-                                    __global float *src2, int src2_step, int src2_offset,
-                                    __global uchar *dst,  int dst_step,  int dst_offset,
-                                    int rows, int cols, int dst_step1)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 2;
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align ((dst_offset >> 2) & 3)
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
-        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
-        float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index_fix));
-        float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix));
-        if(src1_index < 0)
-        {
-            float4 tmp;
-            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-        }
-        if(src2_index < 0)
-        {
-            float4 tmp;
-            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-        }
-
-
-        uchar4 dst_data  = *((__global uchar4 *)(dst  + dst_index));
-        uchar4 tmp_data = convert_uchar4((src1_data < src2_data));
-
-        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
-        dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
-        dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
-        dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;
-
-        *((__global uchar4 *)(dst + dst_index)) = dst_data;
-    }
-}
-
-#if defined (DOUBLE_SUPPORT)
-__kernel void arithm_compare_lt_D6 (__global double *src1, int src1_step, int src1_offset,
-                                    __global double *src2, int src2_step, int src2_offset,
-                                    __global uchar *dst,  int dst_step,  int dst_offset,
-                                    int rows, int cols, int dst_step1)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 2;
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align ((dst_offset >> 3) & 3)
-        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
-        int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
-        double4 src1_data = vload4(0, (__global double *)((__global char *)src1 + src1_index_fix));
-        double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index_fix));
-        if(src1_index < 0)
-        {
-            double4 tmp;
-            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-        }
-        if(src2_index < 0)
-        {
-            double4 tmp;
-            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-        }
-
-
-        uchar4 dst_data  = *((__global uchar4 *)(dst  + dst_index));
-        uchar4 tmp_data = convert_uchar4((src1_data < src2_data));
-
-        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
-        dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
-        dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
-        dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;
-
-        *((__global uchar4 *)(dst + dst_index)) = dst_data;
-    }
-}
-#endif
-
-/***********************************Compare LE*******************************/
-__kernel void arithm_compare_le_D0 (__global uchar *src1, int src1_step, int src1_offset,
-                                    __global uchar *src2, int src2_step, int src2_offset,
-                                    __global uchar *dst,  int dst_step,  int dst_offset,
-                                    int rows, int cols, int dst_step1)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 2;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align (dst_offset & 3)
-        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
-        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
-        uchar4 src1_data = vload4(0, src1 + src1_index_fix);
-        uchar4 src2_data = vload4(0, src2 + src2_index_fix);
-        if(src1_index < 0)
-        {
-            uchar4 tmp;
-            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-        }
-        if(src2_index < 0)
-        {
-            uchar4 tmp;
-            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-        }
-
-
-
-        uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
-        uchar4 tmp_data = convert_uchar4((src1_data <= src2_data));
-
-        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
-        dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
-        dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
-        dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;
-
-        *((__global uchar4 *)(dst + dst_index)) = dst_data;
-    }
-}
-
-
-
-__kernel void arithm_compare_le_D2 (__global ushort *src1, int src1_step, int src1_offset,
-                                    __global ushort *src2, int src2_step, int src2_offset,
-                                    __global uchar *dst,  int dst_step,  int dst_offset,
-                                    int rows, int cols, int dst_step1)
-
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 2;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align ((dst_offset >> 1) & 3)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
-        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
-        ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index));
-        ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));
-        if(src1_index < 0)
-        {
-            ushort4 tmp;
-            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-        }
-        if(src2_index < 0)
-        {
-            ushort4 tmp;
-            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-        }
-
-
-
-        uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
-        uchar4 tmp_data = convert_uchar4((src1_data <= src2_data));
-
-        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
-        dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
-        dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
-        dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;
-
-        *((__global uchar4 *)(dst + dst_index)) = dst_data;
-    }
-}
-
-
-
-__kernel void arithm_compare_le_D3 (__global short *src1, int src1_step, int src1_offset,
-                                    __global short *src2, int src2_step, int src2_offset,
-                                    __global uchar *dst,  int dst_step,  int dst_offset,
-                                    int rows, int cols, int dst_step1)
-
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 2;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align ((dst_offset >> 1) & 3)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
-        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
-        short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index));
-        short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));
-        if(src1_index < 0)
-        {
-            short4 tmp;
-            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-        }
-        if(src2_index < 0)
-        {
-            short4 tmp;
-            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-        }
-
-
-
-        uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
-        uchar4 tmp_data = convert_uchar4((src1_data <= src2_data));
-
-        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
-        dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
-        dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
-        dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;
-
-        *((__global uchar4 *)(dst + dst_index)) = dst_data;
-    }
-}
-
-__kernel void arithm_compare_le_D4 (__global int *src1, int src1_step, int src1_offset,
-                                    __global int *src2, int src2_step, int src2_offset,
-                                    __global uchar *dst,  int dst_step,  int dst_offset,
-                                    int rows, int cols, int dst_step1)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 2;
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align ((dst_offset >> 2)& 3)
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
-        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
-
-        int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index));
-        int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index));
-        if(src1_index < 0)
-        {
-            int4 tmp;
-            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-        }
-        if(src2_index < 0)
-        {
-            int4 tmp;
-            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-        }
-
-        uchar4 dst_data  = *((__global uchar4 *)(dst  + dst_index));
-        uchar4 tmp_data =convert_uchar4((src1_data <= src2_data));
-
-        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
-        dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
-        dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
-        dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;
-
-        *((__global uchar4 *)(dst + dst_index)) = dst_data;
-    }
-}
-
-__kernel void arithm_compare_le_D5 (__global float *src1, int src1_step, int src1_offset,
-                                    __global float *src2, int src2_step, int src2_offset,
-                                    __global uchar *dst,  int dst_step,  int dst_offset,
-                                    int rows, int cols, int dst_step1)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 2;
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align ((dst_offset >> 2)& 3)
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
-        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
-        float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index_fix));
-        float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix));
-        if(src1_index < 0)
-        {
-            float4 tmp;
-            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-        }
-        if(src2_index < 0)
-        {
-            float4 tmp;
-            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-        }
-
-        uchar4 dst_data  = *((__global uchar4 *)(dst  + dst_index));
-        uchar4 tmp_data = convert_uchar4((src1_data <= src2_data));
-
-        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
-        dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
-        dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
-        dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;
-
-        *((__global uchar4 *)(dst + dst_index)) = dst_data;
-    }
-}
-
-#if defined (DOUBLE_SUPPORT)
-__kernel void arithm_compare_le_D6 (__global double *src1, int src1_step, int src1_offset,
-                                    __global double *src2, int src2_step, int src2_offset,
-                                    __global uchar *dst,  int dst_step,  int dst_offset,
-                                    int rows, int cols, int dst_step1)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 2;
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align ((dst_offset >> 3)& 3)
-        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
-        int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
-        double4 src1_data = vload4(0, (__global double *)((__global char *)src1 + src1_index_fix));
-        double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index_fix));
-        if(src1_index < 0)
-        {
-            double4 tmp;
-            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-        }
-        if(src2_index < 0)
-        {
-            double4 tmp;
-            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-        }
-
-
-        uchar4 dst_data  = *((__global uchar4 *)(dst  + dst_index));
-        uchar4 tmp_data = convert_uchar4((src1_data <= src2_data));
-
-        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
-        dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
-        dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
-        dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;
-
-        *((__global uchar4 *)(dst + dst_index)) = dst_data;
-    }
-}
-#endif
diff --git a/modules/ocl/src/opencl/arithm_div.cl b/modules/ocl/src/opencl/arithm_div.cl
deleted file mode 100644
index 1dce3853f..000000000
--- a/modules/ocl/src/opencl/arithm_div.cl
+++ /dev/null
@@ -1,468 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Jia Haipeng, jiahaipeng95@gmail.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other oclMaterials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#if defined (DOUBLE_SUPPORT)
-#ifdef cl_khr_fp64
-#pragma OPENCL EXTENSION cl_khr_fp64:enable
-#elif defined (cl_amd_fp64)
-#pragma OPENCL EXTENSION cl_amd_fp64:enable
-#endif
-typedef double F ;
-typedef double4 F4;
-#define convert_F4 convert_double4
-#define convert_F  double
-#else
-typedef float F;
-typedef float4 F4;
-#define convert_F4 convert_float4
-#define convert_F  float
-#endif
-
-inline uchar round2_uchar(F v)
-{
-    return convert_uchar_sat(round(v));
-}
-
-inline ushort round2_ushort(F v)
-{
-    return convert_ushort_sat(round(v));
-}
-
-inline short round2_short(F v)
-{
-    return convert_short_sat(round(v));
-}
-
-inline int round2_int(F v)
-{
-    return convert_int_sat(round(v));
-}
-///////////////////////////////////////////////////////////////////////////////////////
-////////////////////////////divide///////////////////////////////////////////////////
-//////////////////////////////////////////////////////////////////////////////////////
-/**********************************div*********************************************/
-__kernel void arithm_div_D0 (__global uchar *src1, int src1_step, int src1_offset,
-                             __global uchar *src2, int src2_step, int src2_offset,
-                             __global uchar *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1, F scalar)
-{
-    int2 coor = (int2)(get_global_id(0), get_global_id(1));
-
-    if (coor.x < cols && coor.y < rows)
-    {
-        coor.x = coor.x << 2;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align (dst_offset & 3)
-        int2 src_index = (int2)(mad24(coor.y, src1_step, coor.x + src1_offset - dst_align),
-                                mad24(coor.y, src2_step, coor.x + src2_offset - dst_align));
-
-        int4 dst_args  = (int4)(mad24(coor.y, dst_step, dst_offset),
-                                mad24(coor.y, dst_step, dst_offset + dst_step1),
-                                mad24(coor.y, dst_step, dst_offset + coor.x & (int)0xfffffffc),
-                                0);
-
-        uchar4 src1_data = vload4(0, src1 + src_index.x);
-        uchar4 src2_data = vload4(0, src2 + src_index.y);
-        uchar4 dst_data  = *((__global uchar4 *)(dst + dst_args.z));
-
-        F4 tmp      = convert_F4(src1_data) * scalar;
-        uchar4 tmp_data;
-        tmp_data.x = ((tmp.x == 0) || (src2_data.x == 0)) ? 0 : round2_uchar(tmp.x / src2_data.x);
-        tmp_data.y = ((tmp.y == 0) || (src2_data.y == 0)) ? 0 : round2_uchar(tmp.y / src2_data.y);
-        tmp_data.z = ((tmp.z == 0) || (src2_data.z == 0)) ? 0 : round2_uchar(tmp.z / src2_data.z);
-        tmp_data.w = ((tmp.w == 0) || (src2_data.w == 0)) ? 0 : round2_uchar(tmp.w / src2_data.w);
-
-        dst_data.x = ((dst_args.z + 0 >= dst_args.x) && (dst_args.z + 0 < dst_args.y)) ? tmp_data.x : dst_data.x;
-        dst_data.y = ((dst_args.z + 1 >= dst_args.x) && (dst_args.z + 1 < dst_args.y)) ? tmp_data.y : dst_data.y;
-        dst_data.z = ((dst_args.z + 2 >= dst_args.x) && (dst_args.z + 2 < dst_args.y)) ? tmp_data.z : dst_data.z;
-        dst_data.w = ((dst_args.z + 3 >= dst_args.x) && (dst_args.z + 3 < dst_args.y)) ? tmp_data.w : dst_data.w;
-
-        *((__global uchar4 *)(dst + dst_args.z)) = dst_data;
-    }
-}
-
-__kernel void arithm_div_D2 (__global ushort *src1, int src1_step, int src1_offset,
-                             __global ushort *src2, int src2_step, int src2_offset,
-                             __global ushort *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1, F scalar)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 2;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align ((dst_offset >> 1) & 3)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
-        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8);
-
-        ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index));
-        ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));
-        ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index));
-
-        F4 tmp   = convert_F4(src1_data) * scalar;
-
-        ushort4 tmp_data;
-        tmp_data.x = ((tmp.x == 0) || (src2_data.x == 0)) ? 0 : round2_ushort(tmp.x / (F)src2_data.x);
-        tmp_data.y = ((tmp.y == 0) || (src2_data.y == 0)) ? 0 : round2_ushort(tmp.y / (F)src2_data.y);
-        tmp_data.z = ((tmp.z == 0) || (src2_data.z == 0)) ? 0 : round2_ushort(tmp.z / (F)src2_data.z);
-        tmp_data.w = ((tmp.w == 0) || (src2_data.w == 0)) ? 0 : round2_ushort(tmp.w / (F)src2_data.w);
-
-        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
-        dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y;
-        dst_data.z = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.z : dst_data.z;
-        dst_data.w = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) ? tmp_data.w : dst_data.w;
-
-        *((__global ushort4 *)((__global char *)dst + dst_index)) = dst_data;
-    }
-}
-__kernel void arithm_div_D3 (__global short *src1, int src1_step, int src1_offset,
-                             __global short *src2, int src2_step, int src2_offset,
-                             __global short *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1, F scalar)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 2;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align ((dst_offset >> 1) & 3)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
-        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8);
-
-        short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index));
-        short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));
-        short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
-
-        F4 tmp   = convert_F4(src1_data) * scalar;
-
-        short4 tmp_data;
-        tmp_data.x = ((tmp.x == 0) || (src2_data.x == 0)) ? 0 : round2_short(tmp.x / (F)src2_data.x);
-        tmp_data.y = ((tmp.y == 0) || (src2_data.y == 0)) ? 0 : round2_short(tmp.y / (F)src2_data.y);
-        tmp_data.z = ((tmp.z == 0) || (src2_data.z == 0)) ? 0 : round2_short(tmp.z / (F)src2_data.z);
-        tmp_data.w = ((tmp.w == 0) || (src2_data.w == 0)) ? 0 : round2_short(tmp.w / (F)src2_data.w);
-
-
-        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
-        dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y;
-        dst_data.z = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.z : dst_data.z;
-        dst_data.w = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) ? tmp_data.w : dst_data.w;
-
-        *((__global short4 *)((__global char *)dst + dst_index)) = dst_data;
-    }
-}
-
-__kernel void arithm_div_D4 (__global int *src1, int src1_step, int src1_offset,
-                             __global int *src2, int src2_step, int src2_offset,
-                             __global int *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1, F scalar)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
-        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);
-
-        int data1 = *((__global int *)((__global char *)src1 + src1_index));
-        int data2 = *((__global int *)((__global char *)src2 + src2_index));
-
-        F tmp  = (convert_F)(data1) * scalar;
-        int tmp_data = (tmp == 0 || data2 == 0) ? 0 : round2_int(tmp / (convert_F)(data2));
-
-        *((__global int *)((__global char *)dst + dst_index)) =tmp_data;
-    }
-}
-
-__kernel void arithm_div_D5 (__global float *src1, int src1_step, int src1_offset,
-                             __global float *src2, int src2_step, int src2_offset,
-                             __global float *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1, F scalar)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
-        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);
-
-        float data1 = *((__global float *)((__global char *)src1 + src1_index));
-        float data2 = *((__global float *)((__global char *)src2 + src2_index));
-
-        F tmp  = (convert_F)(data1) * scalar;
-        float tmp_data = (tmp == 0 || data2 == 0) ? 0 : convert_float(tmp / (convert_F)(data2));
-
-        *((__global float *)((__global char *)dst + dst_index)) = tmp_data;
-    }
-}
-
-#if defined (DOUBLE_SUPPORT)
-__kernel void arithm_div_D6 (__global double *src1, int src1_step, int src1_offset,
-                             __global double *src2, int src2_step, int src2_offset,
-                             __global double *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1, double scalar)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
-        int src2_index = mad24(y, src2_step, (x << 3) + src2_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 3) + dst_offset);
-
-        double data1 = *((__global double *)((__global char *)src1 + src1_index));
-        double data2 = *((__global double *)((__global char *)src2 + src2_index));
-
-        double tmp  = data1 * scalar;
-        double tmp_data = (tmp == 0 || data2 == 0) ? 0 : (tmp / data2);
-
-        *((__global double *)((__global char *)dst + dst_index)) = tmp_data;
-    }
-}
-#endif
-/************************************div with scalar************************************/
-__kernel void arithm_s_div_D0 (__global uchar *src, int src_step, int src_offset,
-                               __global uchar *dst,  int dst_step,  int dst_offset,
-                               int rows, int cols, int dst_step1, F scalar)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 2;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align (dst_offset & 3)
-        int src_index = mad24(y, src_step, x + src_offset - dst_align);
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-
-        uchar4 src_data = vload4(0, src + src_index);
-        uchar4 dst_data  = *((__global uchar4 *)(dst + dst_index));
-
-        uchar4 tmp_data;
-        tmp_data.x = ((scalar == 0) || (src_data.x == 0)) ? 0 : round2_uchar(scalar / (F)src_data.x);
-        tmp_data.y = ((scalar == 0) || (src_data.y == 0)) ? 0 : round2_uchar(scalar / (F)src_data.y);
-        tmp_data.z = ((scalar == 0) || (src_data.z == 0)) ? 0 : round2_uchar(scalar / (F)src_data.z);
-        tmp_data.w = ((scalar == 0) || (src_data.w == 0)) ? 0 : round2_uchar(scalar / (F)src_data.w);
-
-        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
-        dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
-        dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
-        dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;
-
-        *((__global uchar4 *)(dst + dst_index)) = dst_data;
-    }
-}
-
-__kernel void arithm_s_div_D2 (__global ushort *src, int src_step, int src_offset,
-                               __global ushort *dst,  int dst_step,  int dst_offset,
-                               int rows, int cols, int dst_step1, F scalar)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 2;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align ((dst_offset >> 1) & 3)
-        int src_index = mad24(y, src_step, (x << 1) + src_offset - (dst_align << 1));
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8);
-
-        ushort4 src_data = vload4(0, (__global ushort *)((__global char *)src + src_index));
-        ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index));
-
-        ushort4 tmp_data;
-        tmp_data.x = ((scalar == 0) || (src_data.x == 0)) ? 0 : round2_ushort(scalar / (F)src_data.x);
-        tmp_data.y = ((scalar == 0) || (src_data.y == 0)) ? 0 : round2_ushort(scalar / (F)src_data.y);
-        tmp_data.z = ((scalar == 0) || (src_data.z == 0)) ? 0 : round2_ushort(scalar / (F)src_data.z);
-        tmp_data.w = ((scalar == 0) || (src_data.w == 0)) ? 0 : round2_ushort(scalar / (F)src_data.w);
-
-        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
-        dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y;
-        dst_data.z = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.z : dst_data.z;
-        dst_data.w = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) ? tmp_data.w : dst_data.w;
-
-        *((__global ushort4 *)((__global char *)dst + dst_index)) = dst_data;
-    }
-}
-__kernel void arithm_s_div_D3 (__global short *src, int src_step, int src_offset,
-                               __global short *dst,  int dst_step,  int dst_offset,
-                               int rows, int cols, int dst_step1, F scalar)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 2;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align ((dst_offset >> 1) & 3)
-        int src_index = mad24(y, src_step, (x << 1) + src_offset - (dst_align << 1));
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8);
-
-        short4 src_data = vload4(0, (__global short *)((__global char *)src + src_index));
-        short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
-
-        short4 tmp_data;
-        tmp_data.x = ((scalar == 0) || (src_data.x == 0)) ? 0 : round2_short(scalar / (F)src_data.x);
-        tmp_data.y = ((scalar == 0) || (src_data.y == 0)) ? 0 : round2_short(scalar / (F)src_data.y);
-        tmp_data.z = ((scalar == 0) || (src_data.z == 0)) ? 0 : round2_short(scalar / (F)src_data.z);
-        tmp_data.w = ((scalar == 0) || (src_data.w == 0)) ? 0 : round2_short(scalar / (F)src_data.w);
-
-
-        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
-        dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y;
-        dst_data.z = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.z : dst_data.z;
-        dst_data.w = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) ? tmp_data.w : dst_data.w;
-
-        *((__global short4 *)((__global char *)dst + dst_index)) = dst_data;
-    }
-}
-
-__kernel void arithm_s_div_D4 (__global int *src, int src_step, int src_offset,
-                               __global int *dst,  int dst_step,  int dst_offset,
-                               int rows, int cols, int dst_step1, F scalar)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src_index = mad24(y, src_step, (x << 2) + src_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);
-
-        int data = *((__global int *)((__global char *)src + src_index));
-
-        int tmp_data = (scalar == 0 || data == 0) ? 0 : round2_int(scalar / (convert_F)(data));
-
-        *((__global int *)((__global char *)dst + dst_index)) =tmp_data;
-    }
-}
-
-__kernel void arithm_s_div_D5 (__global float *src, int src_step, int src_offset,
-                               __global float *dst,  int dst_step,  int dst_offset,
-                               int rows, int cols, int dst_step1, F scalar)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src_index = mad24(y, src_step, (x << 2) + src_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);
-
-        float data = *((__global float *)((__global char *)src + src_index));
-
-        float tmp_data = (scalar == 0 || data == 0) ? 0 : convert_float(scalar / (convert_F)(data));
-
-        *((__global float *)((__global char *)dst + dst_index)) = tmp_data;
-    }
-}
-
-#if defined (DOUBLE_SUPPORT)
-__kernel void arithm_s_div_D6 (__global double *src, int src_step, int src_offset,
-                               __global double *dst,  int dst_step,  int dst_offset,
-                               int rows, int cols, int dst_step1, double scalar)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src_index = mad24(y, src_step, (x << 3) + src_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 3) + dst_offset);
-
-        double data = *((__global double *)((__global char *)src + src_index));
-
-        double tmp_data = (scalar == 0 || data == 0) ? 0 : (scalar / data);
-
-        *((__global double *)((__global char *)dst + dst_index)) = tmp_data;
-    }
-}
-#endif
diff --git a/modules/ocl/src/opencl/arithm_exp.cl b/modules/ocl/src/opencl/arithm_exp.cl
index 6f537a287..b2143ba14 100644
--- a/modules/ocl/src/opencl/arithm_exp.cl
+++ b/modules/ocl/src/opencl/arithm_exp.cl
@@ -42,52 +42,70 @@
 // the use of this software, even if advised of the possibility of such damage.
 //
 //M*/
-#if defined (DOUBLE_SUPPORT)
-#pragma OPENCL EXTENSION cl_khr_fp64:enable
-#endif
 
+#if defined (DOUBLE_SUPPORT)
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64:enable
+#elif defined (cl_amd_fp64)
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#endif
+#endif
 
 //////////////////////////////////////////////////////////////////////////////////////////////////////
 /////////////////////////////////////////////EXP//////////////////////////////////////////////////////
 ///////////////////////////////////////////////////////////////////////////////////////////////////////
 
-__kernel void arithm_exp_D5(int rows, int cols, int srcStep, int dstStep, int srcOffset, int dstOffset, __global float *src, __global float *dst)
+// Element-wise exp(): one work-item per srcT element of row y.
+// cols1, the offsets and the steps index src/dst directly, i.e. they count srcT elements.
+__kernel void arithm_exp_C1(__global srcT *src, __global srcT *dst, int cols1, int rows,
+    int srcOffset1, int dstOffset1, int srcStep1, int dstStep1)
 {
-
     int x = get_global_id(0);
     int y = get_global_id(1);
 
-    if(x < cols && y < rows)
+    if (y < rows && x < cols1)
     {
-      x = x << 2;
-      int srcIdx = mad24( y, srcStep, x + srcOffset);
-      int dstIdx = mad24( y, dstStep, x + dstOffset);
-
-      float src_data = *((__global float *)((__global char *)src + srcIdx));
-      float dst_data = exp(src_data);
-
-      *((__global float *)((__global char *)dst + dstIdx)) = dst_data;
+        const int srcPos = mad24(y, srcStep1, x + srcOffset1);
+        const int dstPos = mad24(y, dstStep1, x + dstOffset1);
 
+        dst[dstPos] = exp(src[srcPos]);
     }
 }
 
-#if defined (DOUBLE_SUPPORT)
-__kernel void arithm_exp_D6(int rows, int cols, int srcStep, int dstStep, int srcOffset, int dstOffset, __global double *src, __global double *dst)
+// Element-wise exp(): each work-item covers up to 2 consecutive srcT elements of row y.
+// cols1, offsets and steps count srcT elements; a lane at or past cols1 never touches dst.
+__kernel void arithm_exp_C2(__global srcT *src, __global srcT *dst, int cols1, int rows,
+    int srcOffset1, int dstOffset1, int srcStep1, int dstStep1)
 {
-  int x = get_global_id(0);
-  int y = get_global_id(1);
-  if(x < cols && y < rows )
-  {
-      x = x << 3;
-      int srcIdx = mad24( y, srcStep, x + srcOffset);
-      int dstIdx = mad24( y, dstStep, x + dstOffset);
+    int x1 = get_global_id(0) << 1;
+    int y = get_global_id(1);
 
-      double src_data = *((__global double *)((__global char *)src + srcIdx));
-      double dst_data = exp(src_data);
+    if(x1 < cols1 && y < rows)
+    {
+        int srcIdx = mad24(y, srcStep1, x1 + srcOffset1);
+        int dstIdx = mad24(y, dstStep1, x1 + dstOffset1);
 
-      *((__global double *)((__global char *)dst + dstIdx )) = dst_data;
-     // dst[dstIdx] = exp(src[srcIdx]);
-  }
+        dst[dstIdx] = exp(src[srcIdx]);
+        if (x1 + 1 < cols1) dst[dstIdx + 1] = exp(src[srcIdx + 1]); // skip, don't write back
+    }
 }
 
-#endif
+// Element-wise exp(): each work-item covers up to 4 consecutive srcT elements of row y.
+// cols1, offsets and steps count srcT elements; lanes at or past cols1 never touch dst.
+__kernel void arithm_exp_C4(__global srcT *src, __global srcT *dst, int cols1, int rows,
+    int srcOffset1, int dstOffset1, int srcStep1, int dstStep1)
+{
+    int x1 = get_global_id(0) << 2;
+    int y = get_global_id(1);
+
+    if(x1 < cols1 && y < rows)
+    {
+        int srcIdx = mad24(y, srcStep1, x1 + srcOffset1);
+        int dstIdx = mad24(y, dstStep1, x1 + dstOffset1);
+
+        dst[dstIdx] = exp(src[srcIdx]);
+        if (x1 + 1 < cols1) dst[dstIdx + 1] = exp(src[srcIdx + 1]);
+        if (x1 + 2 < cols1) dst[dstIdx + 2] = exp(src[srcIdx + 2]);
+        if (x1 + 3 < cols1) dst[dstIdx + 3] = exp(src[srcIdx + 3]);
+    }
+}
diff --git a/modules/ocl/src/opencl/arithm_log.cl b/modules/ocl/src/opencl/arithm_log.cl
index ea19c9d90..ef8c4dd04 100644
--- a/modules/ocl/src/opencl/arithm_log.cl
+++ b/modules/ocl/src/opencl/arithm_log.cl
@@ -1,4 +1,3 @@
-
 /*M///////////////////////////////////////////////////////////////////////////////////////
 //
 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
@@ -43,52 +42,66 @@
 // the use of this software, even if advised of the possibility of such damage.
 //
 //M*/
+
 #if defined (DOUBLE_SUPPORT)
 #pragma OPENCL EXTENSION cl_khr_fp64:enable
 #endif
 
-#define INF_FLOAT -88.029694
-#define INF_DOUBLE -709.0895657128241
-
-
 //////////////////////////////////////////////////////////////////////////////////////////////////////
 /////////////////////////////////////////////LOG/////////////////////////////////////////////////////
 ///////////////////////////////////////////////////////////////////////////////////////////////////////
 
-__kernel void arithm_log_D5(int rows, int cols, int srcStep, int dstStep, int srcOffset, int dstOffset, __global float *src, __global float *dst)
+// Element-wise log(): one work-item per srcT element of row y.
+// cols1, the offsets and the steps index src/dst directly, i.e. they count srcT elements.
+__kernel void arithm_log_C1(__global srcT *src, __global srcT *dst, int cols1, int rows,
+    int srcOffset1, int dstOffset1, int srcStep1, int dstStep1)
 {
     int x = get_global_id(0);
     int y = get_global_id(1);
 
-    if(x < cols && y < rows )
+    if (y < rows && x < cols1)
     {
-      x = x << 2;
-      int srcIdx = mad24( y, srcStep, x + srcOffset);
-      int dstIdx = mad24( y, dstStep, x + dstOffset);
+        const int srcPos = mad24(y, srcStep1, x + srcOffset1);
+        const int dstPos = mad24(y, dstStep1, x + dstOffset1);
 
-      float src_data = *((__global float *)((__global char *)src + srcIdx));
-      float dst_data = (src_data == 0) ? INF_FLOAT : log(fabs(src_data));
-
-      *((__global float *)((__global char *)dst + dstIdx)) = dst_data;
+        dst[dstPos] = log(src[srcPos]);
     }
 }
 
-#if defined (DOUBLE_SUPPORT)
-__kernel void arithm_log_D6(int rows, int cols, int srcStep, int dstStep, int srcOffset, int dstOffset, __global double *src, __global double *dst)
+// Element-wise log(): each work-item covers up to 2 consecutive srcT elements of row y.
+// cols1, offsets and steps count srcT elements; a lane at or past cols1 never touches dst.
+__kernel void arithm_log_C2(__global srcT *src, __global srcT *dst, int cols1, int rows,
+    int srcOffset1, int dstOffset1, int srcStep1, int dstStep1)
 {
-    int x = get_global_id(0);
+    int x1 = get_global_id(0) << 1;
     int y = get_global_id(1);
 
-    if(x < cols && y < rows )
+    if(x1 < cols1 && y < rows)
     {
-      x = x << 3;
-      int srcIdx = mad24( y, srcStep, x + srcOffset);
-      int dstIdx = mad24( y, dstStep, x + dstOffset);
-
-      double src_data = *((__global double *)((__global char *)src + srcIdx));
-      double dst_data = (src_data == 0) ? INF_DOUBLE : log(fabs(src_data));
-      *((__global double *)((__global char *)dst + dstIdx)) = dst_data;
+        int srcIdx = mad24(y, srcStep1, x1 + srcOffset1);
+        int dstIdx = mad24(y, dstStep1, x1 + dstOffset1);
 
+        dst[dstIdx] = log(src[srcIdx]);
+        if (x1 + 1 < cols1) dst[dstIdx + 1] = log(src[srcIdx + 1]); // skip, don't write back
+    }
+}
+
+// Element-wise log(): each work-item covers up to 4 consecutive srcT elements of row y.
+// cols1, offsets and steps count srcT elements; lanes at or past cols1 never touch dst.
+__kernel void arithm_log_C4(__global srcT *src, __global srcT *dst, int cols1, int rows,
+    int srcOffset1, int dstOffset1, int srcStep1, int dstStep1)
+{
+    int x1 = get_global_id(0) << 2;
+    int y = get_global_id(1);
+
+    if(x1 < cols1 && y < rows)
+    {
+        int srcIdx = mad24(y, srcStep1, x1 + srcOffset1);
+        int dstIdx = mad24(y, dstStep1, x1 + dstOffset1);
+
+        dst[dstIdx] = log(src[srcIdx]);
+        if (x1 + 1 < cols1) dst[dstIdx + 1] = log(src[srcIdx + 1]);
+        if (x1 + 2 < cols1) dst[dstIdx + 2] = log(src[srcIdx + 2]);
+        if (x1 + 3 < cols1) dst[dstIdx + 3] = log(src[srcIdx + 3]);
     }
 }
-#endif
diff --git a/modules/ocl/src/opencl/arithm_magnitudeSqr.cl b/modules/ocl/src/opencl/arithm_magnitudeSqr.cl
deleted file mode 100644
index 3fd697ff1..000000000
--- a/modules/ocl/src/opencl/arithm_magnitudeSqr.cl
+++ /dev/null
@@ -1,177 +0,0 @@
-
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Jia Haipeng, jiahaipeng95@gmail.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other oclMaterials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this softwareif advised of the possibility of such damage.
-//
-//M*/
-
-#if defined (DOUBLE_SUPPORT)
-#pragma OPENCL EXTENSION cl_khr_fp64:enable
-#endif
-
-//////////////////////////////////////////////////////////////////////////////////////////////////////
-/////////////////////////////////////////////magnitudeSqr//////////////////////////////////////////////
-///////////////////////////////////////////////////////////////////////////////////////////////////////
-__kernel void magnitudeSqr_C1_D5 (__global float *src1,int src1_step,int src1_offset,
-                           __global float *src2, int src2_step,int src2_offset,
-                           __global float *dst,  int dst_step,int dst_offset,
-                           int rows,  int cols,int dst_step1)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-
-
-    {
-
-        x = x << 2;
-
-        #define dst_align ((dst_offset >> 2) & 3)
-
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
-        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + (x << 2) -(dst_align << 2));
-    int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-    int src2_index_fix = src2_index < 0 ? 0 : src2_index;
-        float4 src1_data = vload4(0, (__global float  *)((__global char *)src1 + src1_index_fix));
-        float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix));
-    if(src1_index < 0)
-    {
-        float4 tmp;
-        tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-        src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-    }
-    if(src2_index < 0)
-    {
-        float4 tmp;
-        tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-        src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-    }
-        float4 dst_data = *((__global float4 *)((__global char *)dst + dst_index));
-
-        float4   tmp_data  ;
-      tmp_data.x = src1_data.x * src1_data.x + src2_data.x * src2_data.x;
-
-      tmp_data.y = src1_data.y * src1_data.y + src2_data.y * src2_data.y;
-
-      tmp_data.z = src1_data.z * src1_data.z + src2_data.z * src2_data.z;
-
-      tmp_data.w = src1_data.w * src1_data.w + src2_data.w * src2_data.w;
-
-
-
-
-        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
-        dst_data.y = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.y : dst_data.y;
-        dst_data.z = ((dst_index + 8 >= dst_start) && (dst_index + 8 < dst_end)) ? tmp_data.z : dst_data.z;
-        dst_data.w = ((dst_index + 12 >= dst_start) && (dst_index + 12 < dst_end)) ? tmp_data.w : dst_data.w;
-
-        *((__global float4 *)((__global char *)dst + dst_index)) = dst_data;
-    }
-
-}
-
-
-#if defined (DOUBLE_SUPPORT)
-
-__kernel void magnitudeSqr_C2_D5 (__global float *src1,int src1_step,int src1_offset,
-                           __global float *dst,  int dst_step,int dst_offset,
-                           int rows,  int cols,int dst_step1)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-
-
-    {
-
-        x = x << 2;
-
-        #define dst_align ((dst_offset >> 2) & 3)
-
-        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + (x << 2) -(dst_align << 2));
-    int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-
-        float8 src1_data = vload8(0, (__global float  *)((__global char *)src1 + src1_index_fix));
-
-    if(src1_index==-6)
-          src1_data.s01234567 = src1_data.s67012345;
-    if(src1_index==-4)
-          src1_data.s01234567 = src1_data.s45670123;
-    if(src1_index== -2)
-          src1_data.s01234567 = src1_data.s23456701;
-
-
-
-        float4 dst_data = *((__global float4 *)((__global char *)dst + dst_index));
-
-        float4   tmp_data  ;
-      tmp_data.x = src1_data.s0 * src1_data.s0 + src1_data.s1 * src1_data.s1;
-
-      tmp_data.y = src1_data.s2 * src1_data.s2 + src1_data.s3 * src1_data.s3;
-
-      tmp_data.z = src1_data.s4 * src1_data.s4 + src1_data.s5 * src1_data.s5;
-
-      tmp_data.w = src1_data.s6 * src1_data.s6 + src1_data.s7 * src1_data.s7;
-
-
-
-
-        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
-        dst_data.y = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.y : dst_data.y;
-        dst_data.z = ((dst_index + 8 >= dst_start) && (dst_index + 8 < dst_end)) ? tmp_data.z : dst_data.z;
-        dst_data.w = ((dst_index + 12 >= dst_start) && (dst_index + 12 < dst_end)) ? tmp_data.w : dst_data.w;
-
-        *((__global float4 *)((__global char *)dst + dst_index)) = dst_data;
-    }
-
-}
-#endif
diff --git a/modules/ocl/src/opencl/arithm_minMax.cl b/modules/ocl/src/opencl/arithm_minMax.cl
index 1dcb138eb..23b293306 100644
--- a/modules/ocl/src/opencl/arithm_minMax.cl
+++ b/modules/ocl/src/opencl/arithm_minMax.cl
@@ -44,9 +44,14 @@
 //M*/
 
 /**************************************PUBLICFUNC*************************************/
+
 #if defined (DOUBLE_SUPPORT)
+#ifdef cl_amd_fp64
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#elif defined (cl_khr_fp64)
 #pragma OPENCL EXTENSION cl_khr_fp64:enable
 #endif
+#endif
 
 #if defined (DEPTH_0)
 #define VEC_TYPE uchar8
diff --git a/modules/ocl/src/opencl/arithm_minMaxLoc.cl b/modules/ocl/src/opencl/arithm_minMaxLoc.cl
index 94cc14d25..848aac319 100644
--- a/modules/ocl/src/opencl/arithm_minMaxLoc.cl
+++ b/modules/ocl/src/opencl/arithm_minMaxLoc.cl
@@ -142,29 +142,35 @@
 #pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics:enable
 
 /**************************************Array minMax**************************************/
-__kernel void arithm_op_minMaxLoc (int cols,int invalid_cols,int offset,int elemnum,int groupnum,
+
+__kernel void arithm_op_minMaxLoc(int cols, int invalid_cols, int offset, int elemnum, int groupnum,
                                   __global VEC_TYPE *src, __global RES_TYPE *dst)
 {
    unsigned int lid = get_local_id(0);
    unsigned int gid = get_group_id(0);
    unsigned int  id = get_global_id(0);
    unsigned int idx = offset + id + (id / cols) * invalid_cols;
-   __local VEC_TYPE localmem_max[128],localmem_min[128];
-   VEC_TYPE minval,maxval,temp;
-   __local VEC_TYPE_LOC localmem_maxloc[128],localmem_minloc[128];
-   VEC_TYPE_LOC minloc,maxloc,temploc,negative = -1;
+
+   __local VEC_TYPE localmem_max[128], localmem_min[128];
+   VEC_TYPE minval, maxval, temp;
+
+   __local VEC_TYPE_LOC localmem_maxloc[128], localmem_minloc[128];
+   VEC_TYPE_LOC minloc, maxloc, temploc, negative = -1;
+
    int idx_c;
-   if(id < elemnum)
+
+   if (id < elemnum)
    {
        temp = src[idx];
        idx_c = idx << 2;
-       temploc = (VEC_TYPE_LOC)(idx_c,idx_c+1,idx_c+2,idx_c+3);
-       if(id % cols == 0 )
+       temploc = (VEC_TYPE_LOC)(idx_c, idx_c + 1, idx_c + 2, idx_c + 3);
+
+       if (id % cols == 0 )
        {
            repeat_s(temp);
            repeat_s(temploc);
        }
-       if(id % cols == cols - 1)
+       if (id % cols == cols - 1)
        {
            repeat_e(temp);
            repeat_e(temploc);
@@ -181,31 +187,33 @@ __kernel void arithm_op_minMaxLoc (int cols,int invalid_cols,int offset,int elem
        minloc = negative;
        maxloc = negative;
    }
-   float4 aaa;
-   for(id=id + (groupnum << 8); id < elemnum;id = id + (groupnum << 8))
+
+   int grainSize = (groupnum << 8);
+   for (id = id + grainSize; id < elemnum; id = id + grainSize)
    {
        idx = offset + id + (id / cols) * invalid_cols;
        temp = src[idx];
        idx_c = idx << 2;
-       temploc = (VEC_TYPE_LOC)(idx_c,idx_c+1,idx_c+2,idx_c+3);
-       if(id % cols == 0 )
+       temploc = (VEC_TYPE_LOC)(idx_c, idx_c+1, idx_c+2, idx_c+3);
+
+       if (id % cols == 0 )
        {
                repeat_s(temp);
                repeat_s(temploc);
        }
-       if(id % cols == cols - 1)
+       if (id % cols == cols - 1)
        {
                repeat_e(temp);
                repeat_e(temploc);
        }
-       minval = min(minval,temp);
-       maxval = max(maxval,temp);
-       minloc = CONDITION_FUNC(minval == temp, temploc , minloc);
-       maxloc = CONDITION_FUNC(maxval == temp, temploc , maxloc);
-       aaa= convert_float4(maxval == temp);
-       maxloc = convert_int4(aaa) ? temploc : maxloc;
+
+       minval = min(minval, temp);
+       maxval = max(maxval, temp);
+       minloc = CONDITION_FUNC(minval == temp, temploc, minloc);
+       maxloc = CONDITION_FUNC(maxval == temp, temploc, maxloc);
    }
-   if(lid > 127)
+
+   if (lid > 127)
    {
        localmem_min[lid - 128] = minval;
        localmem_max[lid - 128] = maxval;
@@ -213,29 +221,30 @@ __kernel void arithm_op_minMaxLoc (int cols,int invalid_cols,int offset,int elem
        localmem_maxloc[lid - 128] = maxloc;
    }
    barrier(CLK_LOCAL_MEM_FENCE);
-   if(lid < 128)
+
+   if (lid < 128)
    {
        localmem_min[lid] = min(minval,localmem_min[lid]);
        localmem_max[lid] = max(maxval,localmem_max[lid]);
-       localmem_minloc[lid] = CONDITION_FUNC(localmem_min[lid] == minval, minloc , localmem_minloc[lid]);
-       localmem_maxloc[lid] = CONDITION_FUNC(localmem_max[lid] == maxval, maxloc , localmem_maxloc[lid]);
+       localmem_minloc[lid] = CONDITION_FUNC(localmem_min[lid] == minval, minloc, localmem_minloc[lid]);
+       localmem_maxloc[lid] = CONDITION_FUNC(localmem_max[lid] == maxval, maxloc, localmem_maxloc[lid]);
    }
    barrier(CLK_LOCAL_MEM_FENCE);
-   for(int lsize = 64; lsize > 0; lsize >>= 1)
+
+   for (int lsize = 64; lsize > 0; lsize >>= 1)
    {
-       if(lid < lsize)
+       if (lid < lsize)
        {
            int lid2 = lsize + lid;
-           localmem_min[lid] = min(localmem_min[lid] , localmem_min[lid2]);
-           localmem_max[lid] = max(localmem_max[lid] , localmem_max[lid2]);
-           localmem_minloc[lid] =
-                   CONDITION_FUNC(localmem_min[lid] == localmem_min[lid2], localmem_minloc[lid2] , localmem_minloc[lid]);
-           localmem_maxloc[lid] =
-                   CONDITION_FUNC(localmem_max[lid] == localmem_max[lid2], localmem_maxloc[lid2] , localmem_maxloc[lid]);
+           localmem_min[lid] = min(localmem_min[lid], localmem_min[lid2]);
+           localmem_max[lid] = max(localmem_max[lid], localmem_max[lid2]);
+           localmem_minloc[lid] = CONDITION_FUNC(localmem_min[lid] == localmem_min[lid2], localmem_minloc[lid2], localmem_minloc[lid]);
+           localmem_maxloc[lid] = CONDITION_FUNC(localmem_max[lid] == localmem_max[lid2], localmem_maxloc[lid2], localmem_maxloc[lid]);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
-   if( lid == 0)
+
+   if ( lid == 0)
    {
        dst[gid] = CONVERT_RES_TYPE(localmem_min[0]);
        dst[gid + groupnum] = CONVERT_RES_TYPE(localmem_max[0]);
@@ -243,138 +252,3 @@ __kernel void arithm_op_minMaxLoc (int cols,int invalid_cols,int offset,int elem
        dst[gid + 3 * groupnum] = CONVERT_RES_TYPE(localmem_maxloc[0]);
    }
 }
-
-#if defined (REPEAT_S0)
-#define repeat_ms(a) a = a;
-#endif
-#if defined (REPEAT_S1)
-#define repeat_ms(a) a.s0 = 0;
-#endif
-#if defined (REPEAT_S2)
-#define repeat_ms(a) a.s0 = 0;a.s1 = 0;
-#endif
-#if defined (REPEAT_S3)
-#define repeat_ms(a) a.s0 = 0;a.s1 = 0;a.s2 = 0;
-#endif
-
-#if defined (REPEAT_E0)
-#define repeat_me(a) a = a;
-#endif
-#if defined (REPEAT_E1)
-#define repeat_me(a) a.s3 = 0;
-#endif
-#if defined (REPEAT_E2)
-#define repeat_me(a) a.s3 = 0;a.s2 = 0;
-#endif
-#if defined (REPEAT_E3)
-#define repeat_me(a) a.s3 = 0;a.s2 = 0;a.s1 = 0;
-#endif
-
-
-/**************************************Array minMaxLoc mask**************************************/
-/*
-__kernel void arithm_op_minMaxLoc_mask (int cols,int invalid_cols,int offset,int elemnum,int groupnum,__global VEC_TYPE *src,
-                                        int minvalid_cols,int moffset,__global uchar4 *mask,__global RES_TYPE  *dst)
-{
-   unsigned int lid = get_local_id(0);
-   unsigned int gid = get_group_id(0);
-   unsigned int  id = get_global_id(0);
-   unsigned int idx = offset + id + (id / cols) * invalid_cols;
-   unsigned int midx = moffset + id + (id / cols) * minvalid_cols;
-   __local VEC_TYPE localmem_max[128],localmem_min[128];
-   VEC_TYPE minval,maxval,temp,max_val = MAX_VAL,min_val = MIN_VAL,zero = 0,m_temp;
-   __local VEC_TYPE_LOC localmem_maxloc[128],localmem_minloc[128];
-   VEC_TYPE_LOC minloc,maxloc,temploc,negative = -1;
-   if(id < elemnum)
-   {
-       temp = src[idx];
-       m_temp = CONVERT_TYPE(mask[midx]);
-       int idx_c = idx << 2;
-       temploc = (VEC_TYPE_LOC)(idx_c,idx_c+1,idx_c+2,idx_c+3);
-       if(id % cols == 0 )
-       {
-           repeat_ms(m_temp);
-           repeat_s(temploc);
-       }
-       if(id % cols == cols - 1)
-       {
-           repeat_me(m_temp);
-           repeat_e(temploc);
-       }
-       minval = m_temp > zero ? temp : max_val;
-       maxval = m_temp > zero ? temp : min_val;
-       minloc = CONDITION_FUNC(m_temp > zero, temploc , negative);
-       maxloc = minloc;
-   }
-   else
-   {
-       minval = MAX_VAL;
-       maxval = MIN_VAL;
-       minloc = negative;
-       maxloc = negative;
-   }
-   for(id=id + (groupnum << 8); id < elemnum;id = id + (groupnum << 8))
-   {
-       idx = offset + id + (id / cols) * invalid_cols;
-       midx = moffset + id + (id / cols) * minvalid_cols;
-       temp = src[idx];
-       m_temp = CONVERT_TYPE(mask[midx]);
-       int idx_c = idx << 2;
-       temploc = (VEC_TYPE_LOC)(idx_c,idx_c+1,idx_c+2,idx_c+3);
-       if(id % cols == 0 )
-       {
-           repeat_ms(m_temp);
-           repeat_s(temploc);
-       }
-       if(id % cols == cols - 1)
-       {
-           repeat_me(m_temp);
-           repeat_e(temploc);
-       }
-       minval = min(minval,m_temp > zero ? temp : max_val);
-       maxval = max(maxval,m_temp > zero ? temp : min_val);
-
-       temploc = CONDITION_FUNC(m_temp > zero, temploc , negative);
-       minloc = CONDITION_FUNC(minval == temp, temploc , minloc);
-       maxloc = CONDITION_FUNC(maxval == temp, temploc , maxloc);
-   }
-   if(lid > 127)
-   {
-       localmem_min[lid - 128] = minval;
-       localmem_max[lid - 128] = maxval;
-       localmem_minloc[lid - 128] = minloc;
-       localmem_maxloc[lid - 128] = maxloc;
-   }
-   barrier(CLK_LOCAL_MEM_FENCE);
-   if(lid < 128)
-   {
-       localmem_min[lid] = min(minval,localmem_min[lid]);
-       localmem_max[lid] = max(maxval,localmem_max[lid]);
-       localmem_minloc[lid] = CONDITION_FUNC(localmem_min[lid] == minval, minloc , localmem_minloc[lid]);
-       localmem_maxloc[lid] = CONDITION_FUNC(localmem_max[lid] == maxval, maxloc , localmem_maxloc[lid]);
-   }
-   barrier(CLK_LOCAL_MEM_FENCE);
-   for(int lsize = 64; lsize > 0; lsize >>= 1)
-   {
-       if(lid < lsize)
-       {
-           int lid2 = lsize + lid;
-           localmem_min[lid] = min(localmem_min[lid] , localmem_min[lid2]);
-           localmem_max[lid] = max(localmem_max[lid] , localmem_max[lid2]);
-           localmem_minloc[lid] =
-                   CONDITION_FUNC(localmem_min[lid] == localmem_min[lid2], localmem_minloc[lid2] , localmem_minloc[lid]);
-           localmem_maxloc[lid] =
-                   CONDITION_FUNC(localmem_max[lid] == localmem_max[lid2], localmem_maxloc[lid2] , localmem_maxloc[lid]);
-       }
-       barrier(CLK_LOCAL_MEM_FENCE);
-   }
-   if( lid == 0)
-   {
-       dst[gid] = CONVERT_RES_TYPE(localmem_min[0]);
-       dst[gid + groupnum] = CONVERT_RES_TYPE(localmem_max[0]);
-       dst[gid + 2 * groupnum] = CONVERT_RES_TYPE(localmem_minloc[0]);
-       dst[gid + 3 * groupnum] = CONVERT_RES_TYPE(localmem_maxloc[0]);
-   }
-}
-
-*/
diff --git a/modules/ocl/src/opencl/arithm_mul.cl b/modules/ocl/src/opencl/arithm_mul.cl
deleted file mode 100644
index bfbb5942e..000000000
--- a/modules/ocl/src/opencl/arithm_mul.cl
+++ /dev/null
@@ -1,303 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Jia Haipeng, jiahaipeng95@gmail.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other GpuMaterials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#if defined (DOUBLE_SUPPORT)
-#ifdef cl_khr_fp64
-#pragma OPENCL EXTENSION cl_khr_fp64:enable
-#elif defined (cl_amd_fp64)
-#pragma OPENCL EXTENSION cl_amd_fp64:enable
-#endif
-#endif
-
-int4 round_int4(float4 v)
-{
-    v.s0 = v.s0 + (v.s0 > 0 ? 0.5 : -0.5);
-    v.s1 = v.s1 + (v.s1 > 0 ? 0.5 : -0.5);
-    v.s2 = v.s2 + (v.s2 > 0 ? 0.5 : -0.5);
-    v.s3 = v.s3 + (v.s3 > 0 ? 0.5 : -0.5);
-
-    return convert_int4_sat(v);
-}
-uint4 round_uint4(float4 v)
-{
-    v.s0 = v.s0 + (v.s0 > 0 ? 0.5 : -0.5);
-    v.s1 = v.s1 + (v.s1 > 0 ? 0.5 : -0.5);
-    v.s2 = v.s2 + (v.s2 > 0 ? 0.5 : -0.5);
-    v.s3 = v.s3 + (v.s3 > 0 ? 0.5 : -0.5);
-
-    return convert_uint4_sat(v);
-}
-long round_int(float v)
-{
-    v = v + (v > 0 ? 0.5 : -0.5);
-
-    return convert_int_sat(v);
-}
-//////////////////////////////////////////////////////////////////////////////////////////////////////
-/////////////////////////////////////////////multiply//////////////////////////////////////////////////
-///////////////////////////////////////////////////////////////////////////////////////////////////////
-/**************************************add without mask**************************************/
-__kernel void arithm_mul_D0 (__global uchar *src1, int src1_step, int src1_offset,
-                             __global uchar *src2, int src2_step, int src2_offset,
-                             __global uchar *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1, float scalar)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 2;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align (dst_offset & 3)
-        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
-        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-
-        uchar4 src1_data ,src2_data;
-
-        src1_data.x= src1_index+0 >= 0 ? src1[src1_index+0] : 0;
-        src1_data.y= src1_index+1 >= 0 ? src1[src1_index+1] : 0;
-        src1_data.z= src1_index+2 >= 0 ? src1[src1_index+2] : 0;
-        src1_data.w= src1_index+3 >= 0 ? src1[src1_index+3] : 0;
-
-        src2_data.x= src2_index+0 >= 0 ? src2[src2_index+0] : 0;
-        src2_data.y= src2_index+1 >= 0 ? src2[src2_index+1] : 0;
-        src2_data.z= src2_index+2 >= 0 ? src2[src2_index+2] : 0;
-        src2_data.w= src2_index+3 >= 0 ? src2[src2_index+3] : 0;
-
-        uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
-        int4 tmp      = convert_int4_sat(src1_data) * convert_int4_sat(src2_data);
-        tmp = round_int4(convert_float4(tmp) * scalar);
-        uchar4 tmp_data = convert_uchar4_sat(tmp);
-
-        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
-        dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
-        dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
-        dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;
-
-        *((__global uchar4 *)(dst + dst_index)) = dst_data;
-    }
-}
-__kernel void arithm_mul_D2 (__global ushort *src1, int src1_step, int src1_offset,
-                             __global ushort *src2, int src2_step, int src2_offset,
-                             __global ushort *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1, float scalar)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 2;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align ((dst_offset >> 1) & 3)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
-        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8);
-
-        ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index));
-        ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));
-
-        ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index));
-        uint4    tmp = convert_uint4_sat(src1_data) * convert_uint4_sat(src2_data);
-        tmp = round_uint4(convert_float4(tmp) * scalar);
-        ushort4 tmp_data = convert_ushort4_sat(tmp);
-
-        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
-        dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y;
-        dst_data.z = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.z : dst_data.z;
-        dst_data.w = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) ? tmp_data.w : dst_data.w;
-
-        *((__global ushort4 *)((__global char *)dst + dst_index)) = dst_data;
-    }
-}
-__kernel void arithm_mul_D3 (__global short *src1, int src1_step, int src1_offset,
-                             __global short *src2, int src2_step, int src2_offset,
-                             __global short *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1, float scalar)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 2;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align ((dst_offset >> 1) & 3)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
-        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8);
-
-        short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index));
-        short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));
-
-        short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
-        int4   tmp = convert_int4_sat(src1_data) * convert_int4_sat(src2_data);
-        tmp = round_int4(convert_float4(tmp) * scalar);
-        short4 tmp_data = convert_short4_sat(tmp);
-
-        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
-        dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y;
-        dst_data.z = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.z : dst_data.z;
-        dst_data.w = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) ? tmp_data.w : dst_data.w;
-
-        *((__global short4 *)((__global char *)dst + dst_index)) = dst_data;
-    }
-}
-
-__kernel void arithm_mul_D4 (__global int *src1, int src1_step, int src1_offset,
-                             __global int *src2, int src2_step, int src2_offset,
-                             __global int *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1, float scalar)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
-        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);
-
-        int data1 = *((__global int *)((__global char *)src1 + src1_index));
-        int data2 = *((__global int *)((__global char *)src2 + src2_index));
-        int tmp  = data1 * data2;
-        tmp = round_int((float)tmp * scalar);
-
-        *((__global int *)((__global char *)dst + dst_index)) = convert_int_sat(tmp);
-    }
-}
-__kernel void arithm_mul_D5 (__global float *src1, int src1_step, int src1_offset,
-                             __global float *src2, int src2_step, int src2_offset,
-                             __global float *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1, float scalar)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
-        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);
-
-        float data1 = *((__global float *)((__global char *)src1 + src1_index));
-        float data2 = *((__global float *)((__global char *)src2 + src2_index));
-        float tmp = data1 * data2;
-        tmp = tmp * scalar;
-
-        *((__global float *)((__global char *)dst + dst_index)) = tmp;
-    }
-}
-
-#if defined (DOUBLE_SUPPORT)
-__kernel void arithm_mul_D6 (__global double *src1, int src1_step, int src1_offset,
-                             __global double *src2, int src2_step, int src2_offset,
-                             __global double *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1, double scalar)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
-        int src2_index = mad24(y, src2_step, (x << 3) + src2_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 3) + dst_offset);
-
-        double data1 = *((__global double *)((__global char *)src1 + src1_index));
-        double data2 = *((__global double *)((__global char *)src2 + src2_index));
-
-        double tmp = data1 * data2;
-        tmp = tmp * scalar;
-
-        *((__global double *)((__global char *)dst + dst_index)) = tmp;
-    }
-}
-#endif
-
-#ifdef DOUBLE_SUPPORT
-#define SCALAR_TYPE double
-#else
-#define SCALAR_TYPE float
-#endif
-
-__kernel void arithm_muls_D5 (__global float *src1, int src1_step, int src1_offset,
-                              __global float *dst,  int dst_step,  int dst_offset,
-                              int rows, int cols, int dst_step1, SCALAR_TYPE scalar)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);
-
-        float data1 = *((__global float *)((__global char *)src1 + src1_index));
-        float tmp = data1 * scalar;
-
-        *((__global float *)((__global char *)dst + dst_index)) = tmp;
-    }
-}
diff --git a/modules/ocl/src/opencl/arithm_setidentity.cl b/modules/ocl/src/opencl/arithm_setidentity.cl
new file mode 100644
index 000000000..0604ae81d
--- /dev/null
+++ b/modules/ocl/src/opencl/arithm_setidentity.cl
@@ -0,0 +1,100 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Jin Ma, jin@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+#if defined (DOUBLE_SUPPORT)
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64:enable
+#elif defined (cl_amd_fp64)
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#endif // cl_khr_fp64 / cl_amd_fp64
+#endif // DOUBLE_SUPPORT
+
+// DATA_TYPE: double when DOUBLE_SUPPORT is defined, float otherwise; used for the scalar kernel arguments below.
+#if defined (DOUBLE_SUPPORT)
+#define DATA_TYPE double
+#else
+#define DATA_TYPE float
+#endif
+
+// Fills a float matrix with a scaled identity: scalar where column == row, exact zero elsewhere.
+__kernel void setIdentityKernel_F1(__global float* src, int src_row, int src_col, int src_step, DATA_TYPE scalar)
+{
+    int x = get_global_id(0); // column; bounded by src_col
+    int y = get_global_id(1); // row; bounded by src_row, src_step is its stride in float elements
+
+    if(x < src_col && y < src_row)
+    {
+        // Store a literal zero off the diagonal: 0 * scalar would produce NaN for an
+        // infinite or NaN scalar and -0.0 for a negative one.
+        src[y * src_step + x] = (x == y) ? scalar : 0;
+    }
+}
+
+// Fills a DATA_TYPE matrix with a scaled identity: scalar where column == row, exact zero elsewhere.
+__kernel void setIdentityKernel_D1(__global DATA_TYPE* src, int src_row, int src_col, int src_step, DATA_TYPE scalar)
+{
+    int x = get_global_id(0); // column; bounded by src_col
+    int y = get_global_id(1); // row; bounded by src_row, src_step is its stride in DATA_TYPE elements
+
+    if(x < src_col && y < src_row)
+    {
+        // Store a literal zero off the diagonal: 0 * scalar would produce NaN for an
+        // infinite or NaN scalar and -0.0 for a negative one.
+        src[y * src_step + x] = (x == y) ? scalar : 0;
+    }
+}
+
+// Integer variant: stores scalar where column == row and 0 everywhere else.
+__kernel void setIdentityKernel_I1(__global int* src, int src_row, int src_col, int src_step, int scalar)
+{
+    const int col = get_global_id(0);
+    const int row = get_global_id(1);
+
+    // Work-items outside the src_col x src_row extent write nothing.
+    if(col >= src_col || row >= src_row)
+        return;
+
+    // src_step is used as the per-row stride, counted in int elements.
+    src[row * src_step + col] = (col == row) ? scalar : 0;
+}
diff --git a/modules/ocl/src/opencl/arithm_transpose.cl b/modules/ocl/src/opencl/arithm_transpose.cl
index d0725b017..5328d1f1b 100644
--- a/modules/ocl/src/opencl/arithm_transpose.cl
+++ b/modules/ocl/src/opencl/arithm_transpose.cl
@@ -43,18 +43,23 @@
 //
 //M*/
 
+#if defined (DOUBLE_SUPPORT)
+#ifdef cl_amd_fp64
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#elif defined (cl_khr_fp64)
+#pragma OPENCL EXTENSION cl_khr_fp64:enable
+#endif // cl_amd_fp64 / cl_khr_fp64
+#endif // DOUBLE_SUPPORT
+
 #define TILE_DIM      32
 #define BLOCK_ROWS    8
-#define LDS_STEP     (TILE_DIM + 1)
+#define LDS_STEP      TILE_DIM // row stride of the local tile; NOTE(review): the former "+ 1" padding is gone - confirm bank conflicts on the lx-strided reads are acceptable
 
-
-//8UC1 is not unoptimized, as the size of write per thread is 8
-//which will use completepath
-__kernel void transpose_C1_D0(__global uchar* src, int src_step, int src_offset,
-                              __global uchar* dst, int dst_step, int dst_offset,
-                              int src_rows, int src_cols)
+__kernel void transpose(__global const T* src, __global T* dst,
+    int src_cols, int src_rows,
+    int src_step, int dst_step,
+    int src_offset, int dst_offset)
 {
-
     int gp_x = get_group_id(0),   gp_y = get_group_id(1);
     int gs_x = get_num_groups(0), gs_y = get_num_groups(1);
 
@@ -81,430 +86,54 @@ __kernel void transpose_C1_D0(__global uchar* src, int src_step, int src_offset,
     int x_index = groupId_y * TILE_DIM + lx;
     int y_index = groupId_x * TILE_DIM + ly;
 
-    __local uchar title[TILE_DIM * LDS_STEP];
+    __local T title[TILE_DIM * LDS_STEP];
 
-    if(x < src_cols && y < src_rows)
+    if (x < src_cols && y < src_rows)
     {
         int index_src = mad24(y, src_step, x);
 
-        #pragma unroll
         for(int i = 0; i < TILE_DIM; i += BLOCK_ROWS)
         {
-            if(y + i < src_rows)
+            if (y + i < src_rows)
             {
-                title[(ly + i) * LDS_STEP + lx] =*(src + src_offset + index_src);
+                title[(ly + i) * LDS_STEP + lx] = src[src_offset + index_src];
                 index_src = mad24(BLOCK_ROWS, src_step, index_src);
             }
         }
-     }
+    }
 
-     barrier(CLK_LOCAL_MEM_FENCE);
+    barrier(CLK_LOCAL_MEM_FENCE);
 
-    if(x_index < src_rows && y_index < src_cols)
+    if (x_index < src_rows && y_index < src_cols)
     {
         int index_dst = mad24(y_index, dst_step, x_index);
 
-        #pragma unroll
         for(int i = 0; i < TILE_DIM; i += BLOCK_ROWS)
         {
-            if((y_index + i) < src_cols)
+            if ((y_index + i) < src_cols)
             {
-                *(dst + dst_offset + index_dst ) = title[lx * LDS_STEP + ly + i];
-                index_dst +=  dst_step * BLOCK_ROWS ;
+                dst[dst_offset + index_dst] = title[lx * LDS_STEP + ly + i];
+                index_dst +=  dst_step * BLOCK_ROWS;
             }
         }
     }
 }
 
-__kernel void transpose_C1_D4(__global int* src, int src_step, int src_offset,
-                              __global int* dst, int dst_step, int dst_offset,
-                              int src_rows, int src_cols)
+__kernel void transpose_inplace(__global T* src, __global T* dst,
+    int src_cols, int src_rows,
+    int src_step, int dst_step,
+    int src_offset, int dst_offset)
 {
+    int x = get_global_id(0);
+    int y = get_global_id(1);
 
-    int gp_x = get_group_id(0),   gp_y = get_group_id(1);
-    int gs_x = get_num_groups(0), gs_y = get_num_groups(1);
-
-    int groupId_x, groupId_y;
-
-    if(src_rows == src_cols)
+    if (y < src_rows && x < y)
     {
-        groupId_y = gp_x;
-        groupId_x = (gp_x + gp_y) % gs_x;
-    }
-    else
-    {
-        int bid = gp_x + gs_x * gp_y;
-        groupId_y =  bid % gs_y;
-        groupId_x = ((bid / gs_y) + groupId_y) % gs_x;
-    }
+        int srcIdx = mad24(y, src_step, src_offset + x);
+        int dstIdx = mad24(x, dst_step, dst_offset + y);
 
-    int lx = get_local_id(0);
-    int ly = get_local_id(1);
-
-    int x = groupId_x * TILE_DIM + lx;
-    int y = groupId_y * TILE_DIM + ly;
-
-    int x_index = groupId_y * TILE_DIM + lx;
-    int y_index = groupId_x * TILE_DIM + ly;
-
-    __local int title[TILE_DIM * LDS_STEP];
-
-    if(x < src_cols && y < src_rows)
-    {
-        int index_src = mad24(y, src_step, (x << 2));
-
-        #pragma unroll
-        for(int i = 0; i < TILE_DIM; i += BLOCK_ROWS)
-        {
-            if(y + i < src_rows)
-            {
-                title[(ly + i) * LDS_STEP + lx] = *((__global int *)((__global char*)src + src_offset + index_src));
-                index_src = mad24(BLOCK_ROWS, src_step, index_src);
-            }
-        }
-     }
-
-     barrier(CLK_LOCAL_MEM_FENCE);
-
-    if(x_index < src_rows && y_index < src_cols)
-    {
-        int index_dst = mad24(y_index, dst_step, (x_index << 2));
-
-        #pragma unroll
-        for(int i = 0; i < TILE_DIM; i += BLOCK_ROWS)
-        {
-            if((y_index + i) < src_cols)
-            {
-                *((__global int*)((__global char*)dst + dst_offset + index_dst )) = title[lx * LDS_STEP + ly + i];
-                index_dst +=  dst_step * BLOCK_ROWS ;
-            }
-        }
-    }
-}
-__kernel void transpose_C1_D5(__global float* src, int src_step, int src_offset,
-                              __global float* dst, int dst_step, int dst_offset,
-                              int src_rows, int src_cols)
-{
-
-    int gp_x = get_group_id(0),   gp_y = get_group_id(1);
-    int gs_x = get_num_groups(0), gs_y = get_num_groups(1);
-
-    int groupId_x, groupId_y;
-
-    if(src_rows == src_cols)
-    {
-        groupId_y = gp_x;
-        groupId_x = (gp_x + gp_y) % gs_x;
-    }
-    else
-    {
-        int bid = gp_x + gs_x * gp_y;
-        groupId_y =  bid % gs_y;
-        groupId_x = ((bid / gs_y) + groupId_y) % gs_x;
-    }
-
-    int lx = get_local_id(0);
-    int ly = get_local_id(1);
-
-    int x = groupId_x * TILE_DIM + lx;
-    int y = groupId_y * TILE_DIM + ly;
-
-    int x_index = groupId_y * TILE_DIM + lx;
-    int y_index = groupId_x * TILE_DIM + ly;
-
-    __local float title[TILE_DIM * LDS_STEP];
-
-    if(x < src_cols && y < src_rows)
-    {
-        int index_src = mad24(y, src_step, (x << 2));
-
-        #pragma unroll
-        for(int i = 0; i < TILE_DIM; i += BLOCK_ROWS)
-        {
-            if(y + i < src_rows)
-            {
-                title[(ly + i) * LDS_STEP + lx] = *((__global float *)((__global char*)src + src_offset + index_src));
-                index_src = mad24(BLOCK_ROWS, src_step, index_src);
-            }
-        }
-     }
-
-     barrier(CLK_LOCAL_MEM_FENCE);
-
-    if(x_index < src_rows && y_index < src_cols)
-    {
-        int index_dst = mad24(y_index, dst_step, (x_index << 2));
-
-        #pragma unroll
-        for(int i = 0; i < TILE_DIM; i += BLOCK_ROWS)
-        {
-            if((y_index + i) < src_cols)
-            {
-                *((__global float*)((__global char*)dst + dst_offset + index_dst )) = title[lx * LDS_STEP + ly + i];
-                index_dst +=  dst_step * BLOCK_ROWS ;
-            }
-        }
-    }
-}
-
-__kernel void transpose_C2_D2(__global ushort* src, int src_step, int src_offset,
-                              __global ushort* dst, int dst_step, int dst_offset,
-                              int src_rows, int src_cols)
-{
-
-    int gp_x = get_group_id(0),   gp_y = get_group_id(1);
-    int gs_x = get_num_groups(0), gs_y = get_num_groups(1);
-
-    int groupId_x, groupId_y;
-
-    if(src_rows == src_cols)
-    {
-        groupId_y = gp_x;
-        groupId_x = (gp_x + gp_y) % gs_x;
-    }
-    else
-    {
-        int bid = gp_x + gs_x * gp_y;
-        groupId_y =  bid % gs_y;
-        groupId_x = ((bid / gs_y) + groupId_y) % gs_x;
-    }
-
-    int lx = get_local_id(0);
-    int ly = get_local_id(1);
-
-    int x = groupId_x * TILE_DIM + lx;
-    int y = groupId_y * TILE_DIM + ly;
-
-    int x_index = groupId_y * TILE_DIM + lx;
-    int y_index = groupId_x * TILE_DIM + ly;
-
-    __local ushort2 title[TILE_DIM * LDS_STEP];
-
-    if(x < src_cols && y < src_rows)
-    {
-        int index_src = mad24(y, src_step, (x << 2));
-
-        #pragma unroll
-        for(int i = 0; i < TILE_DIM; i += BLOCK_ROWS)
-        {
-            if(y + i < src_rows)
-            {
-                title[(ly + i) * LDS_STEP + lx] = *((__global ushort2 *)((__global char*)src + src_offset + index_src));
-                index_src = mad24(BLOCK_ROWS, src_step, index_src);
-            }
-        }
-     }
-
-     barrier(CLK_LOCAL_MEM_FENCE);
-
-    if(x_index < src_rows && y_index < src_cols)
-    {
-        int index_dst = mad24(y_index, dst_step, (x_index << 2));
-
-        #pragma unroll
-        for(int i = 0; i < TILE_DIM; i += BLOCK_ROWS)
-        {
-            if((y_index + i) < src_cols)
-            {
-                *((__global ushort2*)((__global char*)dst + dst_offset + index_dst )) = title[lx * LDS_STEP + ly + i];
-                index_dst +=  dst_step * BLOCK_ROWS ;
-            }
-        }
-    }
-}
-__kernel void transpose_C2_D3(__global short* src, int src_step, int src_offset,
-                              __global short* dst, int dst_step, int dst_offset,
-                              int src_rows, int src_cols)
-{
-
-    int gp_x = get_group_id(0),   gp_y = get_group_id(1);
-    int gs_x = get_num_groups(0), gs_y = get_num_groups(1);
-
-    int groupId_x, groupId_y;
-
-    if(src_rows == src_cols)
-    {
-        groupId_y = gp_x;
-        groupId_x = (gp_x + gp_y) % gs_x;
-    }
-    else
-    {
-        int bid = gp_x + gs_x * gp_y;
-        groupId_y =  bid % gs_y;
-        groupId_x = ((bid / gs_y) + groupId_y) % gs_x;
-    }
-
-    int lx = get_local_id(0);
-    int ly = get_local_id(1);
-
-    int x = groupId_x * TILE_DIM + lx;
-    int y = groupId_y * TILE_DIM + ly;
-
-    int x_index = groupId_y * TILE_DIM + lx;
-    int y_index = groupId_x * TILE_DIM + ly;
-
-    __local short2 title[TILE_DIM * LDS_STEP];
-
-    if(x < src_cols && y < src_rows)
-    {
-        int index_src = mad24(y, src_step, (x << 2));
-
-        #pragma unroll
-        for(int i = 0; i < TILE_DIM; i += BLOCK_ROWS)
-        {
-            if(y + i < src_rows)
-            {
-                title[(ly + i) * LDS_STEP + lx] = *((__global short2 *)((__global char*)src + src_offset + index_src));
-                index_src = mad24(BLOCK_ROWS, src_step, index_src);
-            }
-        }
-     }
-
-     barrier(CLK_LOCAL_MEM_FENCE);
-
-    if(x_index < src_rows && y_index < src_cols)
-    {
-        int index_dst = mad24(y_index, dst_step, (x_index << 2));
-
-        #pragma unroll
-        for(int i = 0; i < TILE_DIM; i += BLOCK_ROWS)
-        {
-            if((y_index + i) < src_cols)
-            {
-                *((__global short2*)((__global char*)dst + dst_offset + index_dst )) = title[lx * LDS_STEP + ly + i];
-                index_dst +=  dst_step * BLOCK_ROWS ;
-            }
-        }
-    }
-}
-__kernel void transpose_C4_D0(__global uchar* src, int src_step, int src_offset,
-                              __global uchar* dst, int dst_step, int dst_offset,
-                              int src_rows, int src_cols)
-{
-
-    int gp_x = get_group_id(0),   gp_y = get_group_id(1);
-    int gs_x = get_num_groups(0), gs_y = get_num_groups(1);
-
-    int groupId_x, groupId_y;
-
-    if(src_rows == src_cols)
-    {
-        groupId_y = gp_x;
-        groupId_x = (gp_x + gp_y) % gs_x;
-    }
-    else
-    {
-        int bid = gp_x + gs_x * gp_y;
-        groupId_y =  bid % gs_y;
-        groupId_x = ((bid / gs_y) + groupId_y) % gs_x;
-    }
-
-    int lx = get_local_id(0);
-    int ly = get_local_id(1);
-
-    int x = groupId_x * TILE_DIM + lx;
-    int y = groupId_y * TILE_DIM + ly;
-
-    int x_index = groupId_y * TILE_DIM + lx;
-    int y_index = groupId_x * TILE_DIM + ly;
-
-    __local uchar4 title[TILE_DIM * LDS_STEP];
-
-    if(x < src_cols && y < src_rows)
-    {
-        int index_src = mad24(y, src_step, (x << 2));
-
-        #pragma unroll
-        for(int i = 0; i < TILE_DIM; i += BLOCK_ROWS)
-        {
-            if(y + i < src_rows)
-            {
-                title[(ly + i) * LDS_STEP + lx] = *((__global uchar4 *)(src + src_offset + index_src));
-                index_src = mad24(BLOCK_ROWS, src_step, index_src);
-            }
-        }
-     }
-
-     barrier(CLK_LOCAL_MEM_FENCE);
-
-    if(x_index < src_rows && y_index < src_cols)
-    {
-        int index_dst = mad24(y_index, dst_step, (x_index << 2));
-
-        #pragma unroll
-        for(int i = 0; i < TILE_DIM; i += BLOCK_ROWS)
-        {
-            if((y_index + i) < src_cols)
-            {
-                *((__global uchar4*)(dst + dst_offset + index_dst )) = title[lx * LDS_STEP + ly + i];
-                index_dst +=  dst_step * BLOCK_ROWS ;
-            }
-        }
-    }
-}
-
-__kernel void transpose_C4_D1(__global char* src, int src_step, int src_offset,
-                              __global char* dst, int dst_step, int dst_offset,
-                              int src_rows, int src_cols)
-{
-
-    int gp_x = get_group_id(0),   gp_y = get_group_id(1);
-    int gs_x = get_num_groups(0), gs_y = get_num_groups(1);
-
-    int groupId_x, groupId_y;
-
-    if(src_rows == src_cols)
-    {
-        groupId_y = gp_x;
-        groupId_x = (gp_x + gp_y) % gs_x;
-    }
-    else
-    {
-        int bid = gp_x + gs_x * gp_y;
-        groupId_y =  bid % gs_y;
-        groupId_x = ((bid / gs_y) + groupId_y) % gs_x;
-    }
-
-    int lx = get_local_id(0);
-    int ly = get_local_id(1);
-
-    int x = groupId_x * TILE_DIM + lx;
-    int y = groupId_y * TILE_DIM + ly;
-
-    int x_index = groupId_y * TILE_DIM + lx;
-    int y_index = groupId_x * TILE_DIM + ly;
-
-    __local char4 title[TILE_DIM * LDS_STEP];
-
-    if(x < src_cols && y < src_rows)
-    {
-        int index_src = mad24(y, src_step, (x << 2));
-
-        #pragma unroll
-        for(int i = 0; i < TILE_DIM; i += BLOCK_ROWS)
-        {
-            if(y + i < src_rows)
-            {
-                title[(ly + i) * LDS_STEP + lx] = *((__global char4 *)(src + src_offset + index_src));
-                index_src = mad24(BLOCK_ROWS, src_step, index_src);
-            }
-        }
-     }
-
-     barrier(CLK_LOCAL_MEM_FENCE);
-
-    if(x_index < src_rows && y_index < src_cols)
-    {
-        int index_dst = mad24(y_index, dst_step, (x_index << 2));
-
-        #pragma unroll
-        for(int i = 0; i < TILE_DIM; i += BLOCK_ROWS)
-        {
-            if((y_index + i) < src_cols)
-            {
-                *((__global char4*)(dst + dst_offset + index_dst )) = title[lx * LDS_STEP + ly + i];
-                index_dst +=  dst_step * BLOCK_ROWS ;
-            }
-        }
+        T tmp = dst[dstIdx];
+        dst[dstIdx] = src[srcIdx];
+        src[srcIdx] = tmp;
     }
 }
diff --git a/modules/ocl/src/opencl/knearest.cl b/modules/ocl/src/opencl/knearest.cl
new file mode 100644
index 000000000..47af57a7e
--- /dev/null
+++ b/modules/ocl/src/opencl/knearest.cl
@@ -0,0 +1,186 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Jin Ma, jin@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+#if defined (DOUBLE_SUPPORT)
+#pragma OPENCL EXTENSION cl_khr_fp64:enable
+#define TYPE double
+#else
+#define TYPE float
+#endif
+
+#define CV_SWAP(a,b,t) ((t) = (a), (a) = (b), (b) = (t))
+///////////////////////////////////// find_nearest //////////////////////////////////////
+// Brute-force k-nearest-neighbour search; one work-item along dimension 1 handles one query row.
+//
+// sample, sample_row, sample_col, sample_step:
+//     query matrix, its row count, the number of features compared per row and its row
+//     stride counted in floats (the stride is used to index a float pointer).
+// samples_ocl, sample_ocl_row, sample_ocl_step:
+//     training matrix, its row count and row stride in floats. Element [sample_col] of each
+//     training row is read as the response (label / target value) of that row.
+// k:  capacity of the per-work-item neighbour list.
+// K1: number of list entries reduced into the result; only entries that were filled hold
+//     defined values, so presumably K1 <= min(k, sample_ocl_row) - TODO confirm on the host.
+// _results, _results_step: one output value per query row, written at y * _results_step.
+// _regression: non-zero -> mean of the K1 responses, zero -> most frequent response.
+// nr: local scratch; indices up to 2 * nThreads * k - 1 can be touched. The first
+//     nThreads * k floats hold responses, the rest ("dd") squared distances, both interleaved
+//     with stride nThreads (assumes nThreads matches the work-group height - TODO confirm).
+// sample_ocl_col is not read by the kernel.
+__kernel void knn_find_nearest(__global float* sample, int sample_row, int sample_col, int sample_step,
+                               int k, __global float* samples_ocl, int sample_ocl_row, int sample_ocl_step,
+                               __global float* _results, int _results_step, int _regression, int K1,
+                               int sample_ocl_col, int nThreads, __local float* nr)
+{
+    int k1 = 0;   // number of valid entries in the neighbour list, saturates at k
+    int k2 = 0;   // number of entries that may be shifted down on insert, saturates at k - 1
+
+    const bool regression = (_regression != 0);
+    const TYPE inv_scale = (TYPE)1 / K1;
+
+    int y = get_global_id(1);
+    int j, j1;
+    int threadY = (y % nThreads);   // this work-item's slot in the interleaved scratch lists
+    __local float* dd = nr + nThreads * k;
+    if(y >= sample_row)
+    {
+        return;
+    }
+    for(j = 0; j < sample_ocl_row; j++)
+    {
+        TYPE sum = 0;
+        float si;
+        int t, ii, ii1;
+        // squared Euclidean distance: 16 features at a time, then a scalar tail of up to 16 features
+        for(t = 0; t < sample_col - 16; t += 16)
+        {
+            float16 t0 = vload16(0, sample + y * sample_step + t) - vload16(0, samples_ocl + j * sample_ocl_step + t);
+            t0 *= t0;
+            sum += t0.s0 + t0.s1 + t0.s2 + t0.s3 + t0.s4 + t0.s5 + t0.s6 + t0.s7 +
+                t0.s8 + t0.s9 + t0.sa + t0.sb + t0.sc + t0.sd + t0.se + t0.sf;
+        }
+
+        for(; t < sample_col; t++)
+        {
+            TYPE t0 = sample[y * sample_step + t] - samples_ocl[j * sample_ocl_step + t];
+            sum = sum + t0 * t0;
+        }
+
+        si = (float)sum;
+        // walk up from the worst kept neighbour to the insertion point; distances are
+        // non-negative, so comparing their bit patterns as ints preserves their order
+        for(ii = k1 - 1; ii >= 0; ii--)
+        {
+            if(as_int(si) > as_int(dd[ii * nThreads + threadY]))
+                break;
+        }
+        // insert after position ii, shifting worse neighbours one slot down (the worst one falls off)
+        if(ii < k - 1)
+        {
+            for(ii1 = k2 - 1; ii1 > ii; ii1--)
+            {
+                dd[(ii1 + 1) * nThreads + threadY] = dd[ii1 * nThreads + threadY];
+                nr[(ii1 + 1) * nThreads + threadY] = nr[ii1 * nThreads + threadY];
+            }
+
+            dd[(ii + 1) * nThreads + threadY] = si;
+            nr[(ii + 1) * nThreads + threadY] = samples_ocl[sample_col + j * sample_ocl_step];
+        }
+        // the list holds one more neighbour after each training row until it reaches k entries
+        k1 = (k1 + 1) < k ? (k1 + 1) : k;
+        k2 = k1 < (k - 1) ? k1 : (k - 1);
+    }
+
+    /*! find_nearest_neighbor done!*/
+    /*! write_results start!*/
+    if(regression)
+    {
+        // regression: average of the K1 nearest responses
+        TYPE s = 0;
+        for(j = 0; j < K1; j++)
+            s += nr[j * nThreads + threadY];
+
+        _results[y * _results_step] = (float)(s * inv_scale);
+    }
+    else
+    {
+        // classification: bubble-sort the K1 responses, then pick the longest run of equal values
+        int prev_start = 0, best_count = 0, cur_count;
+        float best_val;
+
+        for(j = K1 - 1; j > 0; j--)
+        {
+            bool swap_f1 = false;
+            for(j1 = 0; j1 < j; j1++)
+            {
+                if(nr[j1 * nThreads + threadY] > nr[(j1 + 1) * nThreads + threadY])
+                {
+                    // the temporary must be float: an int would truncate the swapped response
+                    float t;
+                    CV_SWAP(nr[j1 * nThreads + threadY], nr[(j1 + 1) * nThreads + threadY], t);
+                    swap_f1 = true;
+                }
+            }
+            if(!swap_f1)
+                break;
+        }
+
+        best_val = 0;
+        for(j = 1; j <= K1; j++)
+        {
+            if(j == K1 || nr[j * nThreads + threadY] != nr[(j - 1) * nThreads + threadY])
+            {
+                cur_count = j - prev_start;
+                if(best_count < cur_count)
+                {
+                    best_count = cur_count;
+                    best_val = nr[(j - 1) * nThreads + threadY];
+                }
+                prev_start = j;
+            }
+        }
+        _results[y * _results_step] = best_val;
+    }
+    ///*! write_results done!*/
+}
diff --git a/modules/ocl/src/opencl/merge_mat.cl b/modules/ocl/src/opencl/merge_mat.cl
index ad3cebb95..19e2340eb 100644
--- a/modules/ocl/src/opencl/merge_mat.cl
+++ b/modules/ocl/src/opencl/merge_mat.cl
@@ -204,7 +204,7 @@ __kernel void merge_vector_C2_D4(__global int *mat_dst,  int dst_step,  int dst_
         int src0 = *((__global int *)((__global uchar *)mat_src0 + src0_index + (x << 2)));
         int src1 = *((__global int *)((__global uchar *)mat_src1 + src1_index + (x << 2)));
 
-        *((__global int2 *)((__global uchar *)mat_dst  + dst_index + (x << 4))) = (int2)(src0, src1);
+        *((__global int2 *)((__global uchar *)mat_dst  + dst_index + (x << 3))) = (int2)(src0, src1);
     }
 }
 __kernel void merge_vector_C2_D5(__global float *mat_dst,  int dst_step,  int dst_offset,
@@ -224,7 +224,7 @@ __kernel void merge_vector_C2_D5(__global float *mat_dst,  int dst_step,  int ds
         float src0 = *((__global float *)((__global uchar *)mat_src0 + src0_index + (x << 2)));
         float src1 = *((__global float *)((__global uchar *)mat_src1 + src1_index + (x << 2)));
 
-        *((__global float2 *)((__global uchar *)mat_dst  + dst_index + (x << 4))) = (float2)(src0, src1);
+        *((__global float2 *)((__global uchar *)mat_dst  + dst_index + (x << 3))) = (float2)(src0, src1);
     }
 }
 
diff --git a/modules/ocl/src/opencl/operator_convertTo.cl b/modules/ocl/src/opencl/operator_convertTo.cl
index 1a8dd04b9..278d41f7c 100644
--- a/modules/ocl/src/opencl/operator_convertTo.cl
+++ b/modules/ocl/src/opencl/operator_convertTo.cl
@@ -33,352 +33,28 @@
 // the use of this software, even if advised of the possibility of such damage.
 //
 //
-#define F float
-#define F2 float2
-#define F4 float4
-__kernel void convert_to_S4_C1_D0(
-        __global const int* restrict srcMat,
-        __global uchar* dstMat,
-        int cols,
-        int rows,
-        int srcStep_in_pixel,
-        int srcoffset_in_pixel,
-        int dstStep_in_pixel,
-        int dstoffset_in_pixel,
-        F alpha,
-        F beta)
-{
-        int x=get_global_id(0)<<2;
-        int y=get_global_id(1);
-        //int src_addr_start = mad24(y,srcStep_in_pixel,srcoffset_in_pixel);
-        //int src_addr_end = mad24(y,srcStep_in_pixel,cols+srcoffset_in_pixel);
-        int off_src = (dstoffset_in_pixel & 3);
-        int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel - off_src);
-        int dst_addr_start = mad24(y,dstStep_in_pixel,dstoffset_in_pixel);
-        int dst_addr_end = mad24(y,dstStep_in_pixel,cols+dstoffset_in_pixel);
-        int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel & (int)0xfffffffc);
-        if(x+3<cols && y<rows && off_src==0)
-        {
-            float4 temp_src = convert_float4(vload4(0,srcMat+srcidx));
-            *(__global uchar4*)(dstMat+dstidx) = convert_uchar4_sat(temp_src*(F4)alpha+(F4)beta);
-        }
-        else
-        {
-            if(x+3<cols && y<rows)
-            {
-                float4 temp_src = convert_float4(vload4(0,srcMat+srcidx));
-                uchar4 temp_dst = convert_uchar4_sat(temp_src*(F4)alpha+(F4)beta);
-                dstMat[dstidx] = temp_dst.x;
-                dstMat[dstidx+1] = temp_dst.y;
-                dstMat[dstidx+2] = temp_dst.z;
-                dstMat[dstidx+3] = temp_dst.w;
-            }
-            else if(x+2<cols && y<rows)
-            {
-                float4 temp_src = convert_float4(vload4(0,srcMat+srcidx));
-                uchar4 temp_dst = convert_uchar4_sat(temp_src*(F4)alpha+(F4)beta);
-                dstMat[dstidx] = temp_dst.x;
-                dstMat[dstidx+1] = temp_dst.y;
-                dstMat[dstidx+2] = temp_dst.z;
-            }
-            else if(x+1<cols && y<rows)
-            {
-                float2 temp_src = convert_float2(vload2(0,srcMat+srcidx));
-                uchar2 temp_dst = convert_uchar2_sat(temp_src*(F2)alpha+(F2)beta);
-                dstMat[dstidx] = temp_dst.x;
-                dstMat[dstidx+1] = temp_dst.y;
-            }
-            else if(x<cols && y<rows)
-            {
-                dstMat[dstidx] = convert_uchar_sat(convert_float(srcMat[srcidx])*alpha+beta);;
-            }
-        }
-}
 
-__kernel void convert_to_S4_C4_D0(
-        __global const int4* restrict srcMat,
-        __global uchar4* dstMat,
-        int cols,
-        int rows,
-        int srcStep_in_pixel,
-        int srcoffset_in_pixel,
-        int dstStep_in_pixel,
-        int dstoffset_in_pixel,
-        F alpha,
-        F beta)
-{
-        int x=get_global_id(0);
-        int y=get_global_id(1);
-        int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel);
-        int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
-        if ( (x < cols) & (y < rows) )
-        {
-            float4 temp_src = convert_float4(srcMat[srcidx]);
-            dstMat[dstidx] = convert_uchar4_sat(temp_src*alpha+beta);
-        }
-}
+#ifdef DOUBLE_SUPPORT
+#pragma OPENCL EXTENSION cl_khr_fp64:enable
+#endif
 
-__kernel void convert_to_S5_C1_D0(
-        __global const float* restrict srcMat,
-        __global uchar* dstMat,
-        int cols,
-        int rows,
-        int srcStep_in_pixel,
-        int srcoffset_in_pixel,
-        int dstStep_in_pixel,
-        int dstoffset_in_pixel,
-        F alpha,
-        F beta)
+__kernel void convert_to(
+        __global const srcT* restrict srcMat,
+        __global dstT* dstMat,
+        int cols1, int rows,
+        int sstep1, int soffset1,
+        int dstep1, int doffset1,
+        float alpha, float beta)
 {
-        int x=get_global_id(0)<<2;
-        int y=get_global_id(1);
-        //int src_addr_start = mad24(y,srcStep_in_pixel,srcoffset_in_pixel);
-        //int src_addr_end = mad24(y,srcStep_in_pixel,cols+srcoffset_in_pixel);
-        int off_src = (dstoffset_in_pixel & 3);
-        int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel - off_src);
-        int dst_addr_start = mad24(y,dstStep_in_pixel,dstoffset_in_pixel);
-        int dst_addr_end = mad24(y,dstStep_in_pixel,cols+dstoffset_in_pixel);
-        int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel & (int)0xfffffffc);
-        if(x+3<cols && y<rows && off_src==0)
-        {
-            float4 temp_src = vload4(0,srcMat+srcidx);
-            *(__global uchar4*)(dstMat+dstidx) = convert_uchar4_sat(temp_src*(F4)alpha+(F4)beta);
-        }
-        else
-        {
-            if(x+3<cols && y<rows)
-            {
-                float4 temp_src = vload4(0,srcMat+srcidx);
-                uchar4 temp_dst = convert_uchar4_sat(temp_src*(F4)alpha+(F4)beta);
-                dstMat[dstidx] = temp_dst.x;
-                dstMat[dstidx+1] = temp_dst.y;
-                dstMat[dstidx+2] = temp_dst.z;
-                dstMat[dstidx+3] = temp_dst.w;
-            }
-            else if(x+2<cols && y<rows)
-            {
-                float4 temp_src = vload4(0,srcMat+srcidx);
-                uchar4 temp_dst = convert_uchar4_sat(temp_src*(F4)alpha+(F4)beta);
-                dstMat[dstidx] = temp_dst.x;
-                dstMat[dstidx+1] = temp_dst.y;
-                dstMat[dstidx+2] = temp_dst.z;
-            }
-            else if(x+1<cols && y<rows)
-            {
-                float2 temp_src = vload2(0,srcMat+srcidx);
-                uchar2 temp_dst = convert_uchar2_sat(temp_src*(F2)alpha+(F2)beta);
-                dstMat[dstidx] = temp_dst.x;
-                dstMat[dstidx+1] = temp_dst.y;
-            }
-            else if(x<cols && y<rows)
-            {
-                dstMat[dstidx] = convert_uchar_sat(srcMat[srcidx]*alpha+beta);;
-            }
-        }
-}
-__kernel void convert_to_S5_C4_D0(
-        __global const float4* restrict srcMat,
-        __global uchar4* dstMat,
-        int cols,
-        int rows,
-        int srcStep_in_pixel,
-        int srcoffset_in_pixel,
-        int dstStep_in_pixel,
-        int dstoffset_in_pixel,
-        F alpha,
-        F beta)
-{
-        int x=get_global_id(0);
-        int y=get_global_id(1);
-        int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel);
-        int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
-        if ( (x < cols) & (y < rows) )
-        {
-            float4 temp_src = srcMat[srcidx];
-            dstMat[dstidx] = convert_uchar4_sat(temp_src*alpha+beta);
-        }
-}
+        int x = get_global_id(0);
+        int y = get_global_id(1);
 
-__kernel void convert_to_S0_C1_D4(
-        __global const uchar* restrict srcMat,
-        __global int* dstMat,
-        int cols,
-        int rows,
-        int srcStep_in_pixel,
-        int srcoffset_in_pixel,
-        int dstStep_in_pixel,
-        int dstoffset_in_pixel,
-        F alpha,
-        F beta)
-{
-        int x=get_global_id(0);
-        int y=get_global_id(1);
-        int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel);
-        int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
-        if ( (x < cols) & (y < rows) )
+        int srcidx = mad24(y, sstep1, x + soffset1);
+        int dstidx = mad24(y, dstep1, x + doffset1);
+
+        if ( (x < cols1) && (y < rows) )
         {
             float temp_src = convert_float(srcMat[srcidx]);
-            dstMat[dstidx] = convert_int_sat(temp_src*alpha+beta);
-        }
-}
-
-__kernel void convert_to_S5_C1_D4(
-        __global const float* restrict srcMat,
-        __global int* dstMat,
-        int cols,
-        int rows,
-        int srcStep_in_pixel,
-        int srcoffset_in_pixel,
-        int dstStep_in_pixel,
-        int dstoffset_in_pixel,
-        F alpha,
-        F beta)
-{
-        int x=get_global_id(0);
-        int y=get_global_id(1);
-        int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel);
-        int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
-        if ( (x < cols) & (y < rows) )
-        {
-            float temp_src = srcMat[srcidx];
-            dstMat[dstidx] = convert_int_sat(temp_src*alpha+beta);
-        }
-}
-
-__kernel void convert_to_S0_C4_D4(
-        __global const uchar4* restrict srcMat,
-        __global int4* dstMat,
-        int cols,
-        int rows,
-        int srcStep_in_pixel,
-        int srcoffset_in_pixel,
-        int dstStep_in_pixel,
-        int dstoffset_in_pixel,
-        F alpha,
-        F beta)
-{
-        int x=get_global_id(0);
-        int y=get_global_id(1);
-        int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel);
-        int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
-        if ( (x < cols) & (y < rows) )
-        {
-            float4 temp_src = convert_float4(srcMat[srcidx]);
-            dstMat[dstidx] = convert_int4_sat(temp_src*alpha+beta);
-        }
-}
-
-__kernel void convert_to_S5_C4_D4(
-        __global const float4* restrict srcMat,
-        __global int4* dstMat,
-        int cols,
-        int rows,
-        int srcStep_in_pixel,
-        int srcoffset_in_pixel,
-        int dstStep_in_pixel,
-        int dstoffset_in_pixel,
-        F alpha,
-        F beta)
-{
-        int x=get_global_id(0);
-        int y=get_global_id(1);
-        int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel);
-        int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
-        if ( (x < cols) & (y < rows) )
-        {
-            float4 temp_src = srcMat[srcidx];
-            dstMat[dstidx] = convert_int4_sat(temp_src*alpha+beta);
-        }
-}
-
-__kernel void convert_to_S0_C1_D5(
-        __global const uchar* restrict srcMat,
-        __global float* dstMat,
-        int cols,
-        int rows,
-        int srcStep_in_pixel,
-        int srcoffset_in_pixel,
-        int dstStep_in_pixel,
-        int dstoffset_in_pixel,
-        F alpha,
-        F beta)
-{
-        int x=get_global_id(0);
-        int y=get_global_id(1);
-        int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel);
-        int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
-        if ( (x < cols) & (y < rows) )
-        {
-            float temp_src = convert_float(srcMat[srcidx]);
-            dstMat[dstidx] = temp_src*alpha+beta;
-        }
-}
-
-__kernel void convert_to_S4_C1_D5(
-        __global const int* restrict srcMat,
-        __global float* dstMat,
-        int cols,
-        int rows,
-        int srcStep_in_pixel,
-        int srcoffset_in_pixel,
-        int dstStep_in_pixel,
-        int dstoffset_in_pixel,
-        F alpha,
-        F beta)
-{
-        int x=get_global_id(0);
-        int y=get_global_id(1);
-        int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel);
-        int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
-        if ( (x < cols) & (y < rows) )
-        {
-            float temp_src = convert_float(srcMat[srcidx]);
-            dstMat[dstidx] = temp_src*alpha+beta;
-        }
-}
-
-__kernel void convert_to_S0_C4_D5(
-        __global const uchar4* restrict srcMat,
-        __global float4* dstMat,
-        int cols,
-        int rows,
-        int srcStep_in_pixel,
-        int srcoffset_in_pixel,
-        int dstStep_in_pixel,
-        int dstoffset_in_pixel,
-        F alpha,
-        F beta)
-{
-        int x=get_global_id(0);
-        int y=get_global_id(1);
-        int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel);
-        int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
-        if ( (x < cols) & (y < rows) )
-        {
-            float4 temp_src = convert_float4(srcMat[srcidx]);
-            dstMat[dstidx] = temp_src*alpha+beta;
-        }
-}
-
-__kernel void convert_to_S4_C4_D5(
-        __global const int4* restrict srcMat,
-        __global float4* dstMat,
-        int cols,
-        int rows,
-        int srcStep_in_pixel,
-        int srcoffset_in_pixel,
-        int dstStep_in_pixel,
-        int dstoffset_in_pixel,
-        F alpha,
-        F beta)
-{
-        int x=get_global_id(0);
-        int y=get_global_id(1);
-        int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel);
-        int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
-        if ( (x < cols) & (y < rows) )
-        {
-            float4 temp_src = convert_float4(srcMat[srcidx]);
-            dstMat[dstidx] = temp_src*alpha+beta;
+            dstMat[dstidx] = convertToDstType(temp_src*alpha+beta);
         }
 }
diff --git a/modules/ocl/src/opencl/operator_copyToM.cl b/modules/ocl/src/opencl/operator_copyToM.cl
index c49c6a323..69b5ea4ab 100644
--- a/modules/ocl/src/opencl/operator_copyToM.cl
+++ b/modules/ocl/src/opencl/operator_copyToM.cl
@@ -34,6 +34,14 @@
 //
 //
 
+#if defined (DOUBLE_SUPPORT)
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64:enable
+#elif defined (cl_amd_fp64)
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#endif
+#endif
+
 __kernel void copy_to_with_mask(
         __global const GENTYPE* restrict srcMat,
         __global GENTYPE* dstMat,
@@ -47,16 +55,17 @@ __kernel void copy_to_with_mask(
         int maskStep,
         int maskoffset)
 {
-        int x=get_global_id(0);
-        int y=get_global_id(1);
-        x = x< cols ? x: cols-1;
-        y = y< rows ? y: rows-1;
-        int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel);
-        int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
+    int x=get_global_id(0);
+    int y=get_global_id(1);
+
+    if (x < cols && y < rows)
+    {
         int maskidx = mad24(y,maskStep,x+ maskoffset);
-        uchar mask = maskMat[maskidx];
-        if (mask)
+        if ( maskMat[maskidx])
         {
+            int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel);
+            int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
             dstMat[dstidx] = srcMat[srcidx];
         }
+    }
 }
diff --git a/modules/ocl/src/opencl/operator_setTo.cl b/modules/ocl/src/opencl/operator_setTo.cl
index 0075dc5b5..1d2ad6597 100644
--- a/modules/ocl/src/opencl/operator_setTo.cl
+++ b/modules/ocl/src/opencl/operator_setTo.cl
@@ -34,17 +34,22 @@
 //
 //
 
+#if defined (DOUBLE_SUPPORT)
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64:enable
+#elif defined (cl_amd_fp64)
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#endif
+#endif
 
-__kernel void set_to_without_mask_C1_D0(uchar scalar,__global uchar * dstMat,
+__kernel void set_to_without_mask_C1_D0(__global uchar * scalar,__global uchar * dstMat,
         int cols,int rows,int dstStep_in_pixel,int offset_in_pixel)
 {
         int x=get_global_id(0)<<2;
         int y=get_global_id(1);
-        //int addr_start = mad24(y,dstStep_in_pixel,offset_in_pixel);
-        //int addr_end = mad24(y,dstStep_in_pixel,cols+offset_in_pixel);
         int idx = mad24(y,dstStep_in_pixel,x+ offset_in_pixel);
         uchar4 out;
-        out.x = out.y = out.z = out.w = scalar;
+        out.x = out.y = out.z = out.w = scalar[0];
 
         if ( (x+3 < cols) && (y < rows)&& ((offset_in_pixel&3) == 0))
         {
@@ -77,14 +82,14 @@ __kernel void set_to_without_mask_C1_D0(uchar scalar,__global uchar * dstMat,
         }
 }
 
-__kernel void set_to_without_mask(GENTYPE scalar,__global GENTYPE * dstMat,
-        int cols,int rows,int dstStep_in_pixel,int offset_in_pixel)
+__kernel void set_to_without_mask(__global GENTYPE * scalar,__global GENTYPE * dstMat, // writes scalar[0] to every element of the cols x rows region of dstMat
+        int cols, int rows, int dstStep_in_pixel, int offset_in_pixel) // the stride and offset are used as GENTYPE element counts when indexing dstMat
 {
-        int x=get_global_id(0);
-        int y=get_global_id(1);
+        int x = get_global_id(0); // column handled by this work-item
+        int y = get_global_id(1); // row handled by this work-item
         if ( (x < cols) & (y < rows))
         {
-            int idx = mad24(y,dstStep_in_pixel,x+ offset_in_pixel);
-            dstMat[idx] = scalar;
+            int idx = mad24(y, dstStep_in_pixel, x + offset_in_pixel); // y * dstStep_in_pixel + x + offset_in_pixel
+            dstMat[idx] = scalar[0]; // the fill value arrives through a buffer; only its first element is read
         }
 }
diff --git a/modules/ocl/src/opencl/operator_setToM.cl b/modules/ocl/src/opencl/operator_setToM.cl
index dde12d86f..a1cb092f8 100644
--- a/modules/ocl/src/opencl/operator_setToM.cl
+++ b/modules/ocl/src/opencl/operator_setToM.cl
@@ -33,8 +33,17 @@
 // the use of this software, even if advised of the possibility of such damage.
 //
 //
+
+#if defined (DOUBLE_SUPPORT)
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64:enable
+#elif defined (cl_amd_fp64)
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#endif
+#endif
+
 __kernel void set_to_with_mask(
-        GENTYPE scalar,
+        __global GENTYPE * scalar,
         __global GENTYPE * dstMat,
         int cols,
         int rows,
@@ -44,16 +53,16 @@ __kernel void set_to_with_mask(
         int maskStep,
         int maskoffset)
 {
-        int x=get_global_id(0);
-        int y=get_global_id(1);
-        x = x< cols ? x: cols-1;
-        y = y< rows ? y: rows-1;
-        int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
-        int maskidx = mad24(y,maskStep,x+ maskoffset);
-        uchar mask = maskMat[maskidx];
-        if (mask)
-        {
-            dstMat[dstidx] = scalar;
-        }
+    int x = get_global_id(0);
+    int y = get_global_id(1);
 
+    if (x < cols && y < rows)
+    {
+        int maskidx = mad24(y,maskStep,x+ maskoffset);
+        if (maskMat[maskidx])
+        {
+            int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
+            dstMat[dstidx] = scalar[0];
+        }
+    }
 }
diff --git a/modules/ocl/src/opencl/pyr_down.cl b/modules/ocl/src/opencl/pyr_down.cl
index 9fe8e8a97..e40ad3492 100644
--- a/modules/ocl/src/opencl/pyr_down.cl
+++ b/modules/ocl/src/opencl/pyr_down.cl
@@ -43,37 +43,6 @@
 //
 //M*/
 
-//#pragma OPENCL EXTENSION cl_amd_printf : enable
-
-
-uchar round_uchar_int(int v)
-{
-    return (uchar)((uint)v <= 255 ? v : v > 0 ? 255 : 0);
-}
-
-uchar round_uchar_float(float v)
-{
-    return round_uchar_int(convert_int_sat_rte(v));
-}
-
-uchar4 round_uchar4_int4(int4 v)
-{
-    uchar4 result;
-    result.x = (uchar)(v.x <= 255 ? v.x : v.x > 0 ? 255 : 0);
-    result.y = (uchar)(v.y <= 255 ? v.y : v.y > 0 ? 255 : 0);
-    result.z = (uchar)(v.z <= 255 ? v.z : v.z > 0 ? 255 : 0);
-    result.w = (uchar)(v.w <= 255 ? v.w : v.w > 0 ? 255 : 0);
-    return result;
-}
-
-uchar4 round_uchar4_float4(float4 v)
-{
-    return round_uchar4_int4(convert_int4_sat_rte(v));
-}
-
-
-
-
 int idx_row_low(int y, int last_row)
 {
     return abs(y) % (last_row + 1);
@@ -104,6 +73,10 @@ int idx_col(int x, int last_col)
     return idx_col_low(idx_col_high(x, last_col), last_col);
 }
 
+///////////////////////////////////////////////////////////////////////
+//////////////////////////  CV_8UC1 ///////////////////////////////////
+///////////////////////////////////////////////////////////////////////
+
 __kernel void pyrDown_C1_D0(__global uchar * srcData, int srcStep, int srcRows, int srcCols, __global uchar *dst, int dstStep, int dstCols)
 {
     const int x = get_global_id(0);
@@ -211,10 +184,14 @@ __kernel void pyrDown_C1_D0(__global uchar * srcData, int srcStep, int srcRows,
         const int dst_x = (get_group_id(0) * get_local_size(0) + tid2) / 2;
 
         if (dst_x < dstCols)
-            dst[y * dstStep + dst_x] = round_uchar_float(sum);
+            dst[y * dstStep + dst_x] = convert_uchar_sat_rte(sum);
     }
 }
 
+///////////////////////////////////////////////////////////////////////
+//////////////////////////  CV_8UC4 ///////////////////////////////////
+///////////////////////////////////////////////////////////////////////
+
 __kernel void pyrDown_C4_D0(__global uchar4 * srcData, int srcStep, int srcRows, int srcCols, __global uchar4 *dst, int dstStep, int dstCols)
 {
     const int x = get_global_id(0);
@@ -228,16 +205,16 @@ __kernel void pyrDown_C4_D0(__global uchar4 * srcData, int srcStep, int srcRows,
     const int last_row = srcRows - 1;
     const int last_col = srcCols - 1;
 
-    float4 co1 = 0.375f;//(float4)(0.375f, 0.375f, 0.375f, 0.375f);
-    float4 co2 = 0.25f;//(float4)(0.25f, 0.25f, 0.25f, 0.25f);
-    float4 co3 = 0.0625f;//(float4)(0.0625f, 0.0625f, 0.0625f, 0.0625f);
+    float4 co1 = 0.375f;
+    float4 co2 = 0.25f;
+    float4 co3 = 0.0625f;
 
     if (src_y >= 2 && src_y < srcRows - 2 && x >= 2 && x < srcCols - 2)
     {
         sum =       co3 * convert_float4((((srcData + (src_y - 2) * srcStep / 4))[x]));
-        sum = sum + co2   * convert_float4((((srcData + (src_y - 1) * srcStep / 4))[x]));
-        sum = sum + co1  * convert_float4((((srcData + (src_y    ) * srcStep / 4))[x]));
-        sum = sum + co2   * convert_float4((((srcData + (src_y + 1) * srcStep / 4))[x]));
+        sum = sum + co2 * convert_float4((((srcData + (src_y - 1) * srcStep / 4))[x]));
+        sum = sum + co1 * convert_float4((((srcData + (src_y    ) * srcStep / 4))[x]));
+        sum = sum + co2 * convert_float4((((srcData + (src_y + 1) * srcStep / 4))[x]));
         sum = sum + co3 * convert_float4((((srcData + (src_y + 2) * srcStep / 4))[x]));
 
         smem[2 + get_local_id(0)] = sum;
@@ -247,9 +224,9 @@ __kernel void pyrDown_C4_D0(__global uchar4 * srcData, int srcStep, int srcRows,
             const int left_x = x - 2;
 
             sum =       co3 * convert_float4((((srcData + (src_y - 2) * srcStep / 4))[left_x]));
-            sum = sum + co2   * convert_float4((((srcData + (src_y - 1) * srcStep / 4))[left_x]));
-            sum = sum + co1  * convert_float4((((srcData + (src_y    ) * srcStep / 4))[left_x]));
-            sum = sum + co2   * convert_float4((((srcData + (src_y + 1) * srcStep / 4))[left_x]));
+            sum = sum + co2 * convert_float4((((srcData + (src_y - 1) * srcStep / 4))[left_x]));
+            sum = sum + co1 * convert_float4((((srcData + (src_y    ) * srcStep / 4))[left_x]));
+            sum = sum + co2 * convert_float4((((srcData + (src_y + 1) * srcStep / 4))[left_x]));
             sum = sum + co3 * convert_float4((((srcData + (src_y + 2) * srcStep / 4))[left_x]));
 
             smem[get_local_id(0)] = sum;
@@ -260,9 +237,9 @@ __kernel void pyrDown_C4_D0(__global uchar4 * srcData, int srcStep, int srcRows,
             const int right_x = x + 2;
 
             sum =       co3 * convert_float4((((srcData + (src_y - 2) * srcStep / 4))[right_x]));
-            sum = sum + co2   * convert_float4((((srcData + (src_y - 1) * srcStep / 4))[right_x]));
-            sum = sum + co1  * convert_float4((((srcData + (src_y    ) * srcStep / 4))[right_x]));
-            sum = sum + co2   * convert_float4((((srcData + (src_y + 1) * srcStep / 4))[right_x]));
+            sum = sum + co2 * convert_float4((((srcData + (src_y - 1) * srcStep / 4))[right_x]));
+            sum = sum + co1 * convert_float4((((srcData + (src_y    ) * srcStep / 4))[right_x]));
+            sum = sum + co2 * convert_float4((((srcData + (src_y + 1) * srcStep / 4))[right_x]));
             sum = sum + co3 * convert_float4((((srcData + (src_y + 2) * srcStep / 4))[right_x]));
 
             smem[4 + get_local_id(0)] = sum;
@@ -273,9 +250,9 @@ __kernel void pyrDown_C4_D0(__global uchar4 * srcData, int srcStep, int srcRows,
         int col = idx_col(x, last_col);
 
         sum =       co3 * convert_float4((((srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[col]));
-        sum = sum + co2   * convert_float4((((srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[col]));
-        sum = sum + co1  * convert_float4((((srcData + idx_row(src_y    , last_row) * srcStep / 4))[col]));
-        sum = sum + co2   * convert_float4((((srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[col]));
+        sum = sum + co2 * convert_float4((((srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[col]));
+        sum = sum + co1 * convert_float4((((srcData + idx_row(src_y    , last_row) * srcStep / 4))[col]));
+        sum = sum + co2 * convert_float4((((srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[col]));
         sum = sum + co3 * convert_float4((((srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[col]));
 
         smem[2 + get_local_id(0)] = sum;
@@ -287,9 +264,9 @@ __kernel void pyrDown_C4_D0(__global uchar4 * srcData, int srcStep, int srcRows,
             col = idx_col(left_x, last_col);
 
             sum =       co3 * convert_float4((((srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[col]));
-            sum = sum + co2   * convert_float4((((srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[col]));
-            sum = sum + co1  * convert_float4((((srcData + idx_row(src_y    , last_row) * srcStep / 4))[col]));
-            sum = sum + co2   * convert_float4((((srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[col]));
+            sum = sum + co2 * convert_float4((((srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[col]));
+            sum = sum + co1 * convert_float4((((srcData + idx_row(src_y    , last_row) * srcStep / 4))[col]));
+            sum = sum + co2 * convert_float4((((srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[col]));
             sum = sum + co3 * convert_float4((((srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[col]));
 
             smem[get_local_id(0)] = sum;
@@ -302,9 +279,9 @@ __kernel void pyrDown_C4_D0(__global uchar4 * srcData, int srcStep, int srcRows,
             col = idx_col(right_x, last_col);
 
             sum =       co3 * convert_float4((((srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[col]));
-            sum = sum + co2   * convert_float4((((srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[col]));
-            sum = sum + co1  * convert_float4((((srcData + idx_row(src_y    , last_row) * srcStep / 4))[col]));
-            sum = sum + co2   * convert_float4((((srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[col]));
+            sum = sum + co2 * convert_float4((((srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[col]));
+            sum = sum + co1 * convert_float4((((srcData + idx_row(src_y    , last_row) * srcStep / 4))[col]));
+            sum = sum + co2 * convert_float4((((srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[col]));
             sum = sum + co3 * convert_float4((((srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[col]));
 
             smem[4 + get_local_id(0)] = sum;
@@ -318,18 +295,490 @@ __kernel void pyrDown_C4_D0(__global uchar4 * srcData, int srcStep, int srcRows,
         const int tid2 = get_local_id(0) * 2;
 
         sum =       co3 * smem[2 + tid2 - 2];
-        sum = sum + co2   * smem[2 + tid2 - 1];
-        sum = sum + co1  * smem[2 + tid2    ];
-        sum = sum + co2   * smem[2 + tid2 + 1];
+        sum = sum + co2 * smem[2 + tid2 - 1];
+        sum = sum + co1 * smem[2 + tid2    ];
+        sum = sum + co2 * smem[2 + tid2 + 1];
         sum = sum + co3 * smem[2 + tid2 + 2];
 
         const int dst_x = (get_group_id(0) * get_local_size(0) + tid2) / 2;
 
         if (dst_x < dstCols)
-            dst[y * dstStep / 4 + dst_x] = round_uchar4_float4(sum);
+            dst[y * dstStep / 4 + dst_x] = convert_uchar4_sat_rte(sum);
     }
 }
 
+///////////////////////////////////////////////////////////////////////
+//////////////////////////  CV_16UC1 //////////////////////////////////
+///////////////////////////////////////////////////////////////////////
+// Downsamples a one-channel ushort image by two: a 1/16, 4/16, 6/16, 4/16, 1/16 filter runs down the rows, then across the columns.
+__kernel void pyrDown_C1_D2(__global ushort * srcData, int srcStep, int srcRows, int srcCols, __global ushort *dst, int dstStep, int dstCols)
+{
+    const int x = get_global_id(0); // source column handled by this work-item
+    const int y = get_group_id(1);  // destination row
+
+    __local float smem[256 + 4]; // vertically filtered values; slots 0-1 and 258-259 hold the halo
+    // NOTE(review): the literals 256, 253 and 128 presume 256 work-items along dimension 0 - confirm against the host launch code.
+    float sum;
+
+    const int src_y = 2*y; // centre source row; rows src_y - 2 .. src_y + 2 are combined
+    const int last_row = srcRows - 1;
+    const int last_col = srcCols - 1;
+    // Fast path: rows src_y-2..src_y+2 and columns x-2..x+2 are all inside the image. Row offsets go through a char pointer, i.e. srcStep is in bytes.
+    if (src_y >= 2 && src_y < srcRows - 2 && x >= 2 && x < srcCols - 2)
+    {
+        sum =       0.0625f * ((__global ushort*)((__global char*)srcData + (src_y - 2) * srcStep))[x];
+        sum = sum + 0.25f   * ((__global ushort*)((__global char*)srcData + (src_y - 1) * srcStep))[x];
+        sum = sum + 0.375f  * ((__global ushort*)((__global char*)srcData + (src_y    ) * srcStep))[x];
+        sum = sum + 0.25f   * ((__global ushort*)((__global char*)srcData + (src_y + 1) * srcStep))[x];
+        sum = sum + 0.0625f * ((__global ushort*)((__global char*)srcData + (src_y + 2) * srcStep))[x];
+
+        smem[2 + get_local_id(0)] = sum;
+        // Local ids 0 and 1 additionally filter column x - 2 and store it in the left halo slots 0 and 1.
+        if (get_local_id(0) < 2)
+        {
+            const int left_x = x - 2;
+
+            sum =       0.0625f * ((__global ushort*)((__global char*)srcData + (src_y - 2) * srcStep))[left_x];
+            sum = sum + 0.25f   * ((__global ushort*)((__global char*)srcData + (src_y - 1) * srcStep))[left_x];
+            sum = sum + 0.375f  * ((__global ushort*)((__global char*)srcData + (src_y    ) * srcStep))[left_x];
+            sum = sum + 0.25f   * ((__global ushort*)((__global char*)srcData + (src_y + 1) * srcStep))[left_x];
+            sum = sum + 0.0625f * ((__global ushort*)((__global char*)srcData + (src_y + 2) * srcStep))[left_x];
+
+            smem[get_local_id(0)] = sum;
+        }
+        // Local ids above 253 additionally filter column x + 2 and store it in the right halo slots (4 + local id).
+        if (get_local_id(0) > 253)
+        {
+            const int right_x = x + 2;
+
+            sum =       0.0625f * ((__global ushort*)((__global char*)srcData + (src_y - 2) * srcStep))[right_x];
+            sum = sum + 0.25f   * ((__global ushort*)((__global char*)srcData + (src_y - 1) * srcStep))[right_x];
+            sum = sum + 0.375f  * ((__global ushort*)((__global char*)srcData + (src_y    ) * srcStep))[right_x];
+            sum = sum + 0.25f   * ((__global ushort*)((__global char*)srcData + (src_y + 1) * srcStep))[right_x];
+            sum = sum + 0.0625f * ((__global ushort*)((__global char*)srcData + (src_y + 2) * srcStep))[right_x];
+
+            smem[4 + get_local_id(0)] = sum;
+        }
+    }
+    else // border path: same weighted sums, but every row/column index goes through idx_row / idx_col
+    {
+        int col = idx_col(x, last_col);
+        // NOTE(review): idx_row / idx_col are defined outside this view; presumably they map out-of-range indices back into the image - confirm.
+        sum =       0.0625f * ((__global ushort*)((__global char*)srcData + idx_row(src_y - 2, last_row) * srcStep))[col];
+        sum = sum + 0.25f   * ((__global ushort*)((__global char*)srcData + idx_row(src_y - 1, last_row) * srcStep))[col];
+        sum = sum + 0.375f  * ((__global ushort*)((__global char*)srcData + idx_row(src_y    , last_row) * srcStep))[col];
+        sum = sum + 0.25f   * ((__global ushort*)((__global char*)srcData + idx_row(src_y + 1, last_row) * srcStep))[col];
+        sum = sum + 0.0625f * ((__global ushort*)((__global char*)srcData + idx_row(src_y + 2, last_row) * srcStep))[col];
+
+        smem[2 + get_local_id(0)] = sum;
+        // Left halo, border variant: column x - 2 is remapped before it is read.
+        if (get_local_id(0) < 2)
+        {
+            const int left_x = x - 2;
+
+            col = idx_col(left_x, last_col);
+
+            sum =       0.0625f * ((__global ushort*)((__global char*)srcData + idx_row(src_y - 2, last_row) * srcStep))[col];
+            sum = sum + 0.25f   * ((__global ushort*)((__global char*)srcData + idx_row(src_y - 1, last_row) * srcStep))[col];
+            sum = sum + 0.375f  * ((__global ushort*)((__global char*)srcData + idx_row(src_y    , last_row) * srcStep))[col];
+            sum = sum + 0.25f   * ((__global ushort*)((__global char*)srcData + idx_row(src_y + 1, last_row) * srcStep))[col];
+            sum = sum + 0.0625f * ((__global ushort*)((__global char*)srcData + idx_row(src_y + 2, last_row) * srcStep))[col];
+
+            smem[get_local_id(0)] = sum;
+        }
+        // Right halo, border variant: column x + 2 is remapped before it is read.
+        if (get_local_id(0) > 253)
+        {
+            const int right_x = x + 2;
+
+            col = idx_col(right_x, last_col);
+
+            sum =       0.0625f * ((__global ushort*)((__global char*)srcData + idx_row(src_y - 2, last_row) * srcStep))[col];
+            sum = sum + 0.25f   * ((__global ushort*)((__global char*)srcData + idx_row(src_y - 1, last_row) * srcStep))[col];
+            sum = sum + 0.375f  * ((__global ushort*)((__global char*)srcData + idx_row(src_y    , last_row) * srcStep))[col];
+            sum = sum + 0.25f   * ((__global ushort*)((__global char*)srcData + idx_row(src_y + 1, last_row) * srcStep))[col];
+            sum = sum + 0.0625f * ((__global ushort*)((__global char*)srcData + idx_row(src_y + 2, last_row) * srcStep))[col];
+
+            smem[4 + get_local_id(0)] = sum;
+        }
+    }
+    // Every work-item must have stored its smem values before any neighbouring slot is read below.
+    barrier(CLK_LOCAL_MEM_FENCE);
+    // Horizontal pass: only local ids below 128 produce output, each combining five adjacent smem slots.
+    if (get_local_id(0) < 128)
+    {
+        const int tid2 = get_local_id(0) * 2; // even local column; smem[2 + tid2] is the centre tap
+
+        sum =       0.0625f * smem[2 + tid2 - 2];
+        sum = sum + 0.25f   * smem[2 + tid2 - 1];
+        sum = sum + 0.375f  * smem[2 + tid2    ];
+        sum = sum + 0.25f   * smem[2 + tid2 + 1];
+        sum = sum + 0.0625f * smem[2 + tid2 + 2];
+
+        const int dst_x = (get_group_id(0) * get_local_size(0) + tid2) / 2;
+        // dstStep / 2 turns the destination step into ushort elements; the store saturates and rounds to nearest even.
+        if (dst_x < dstCols)
+            dst[y * dstStep / 2 + dst_x] = convert_ushort_sat_rte(sum);
+    }
+}
+
+///////////////////////////////////////////////////////////////////////
+//////////////////////////  CV_16UC4 //////////////////////////////////
+///////////////////////////////////////////////////////////////////////
+// Downsamples a four-channel ushort image by two: a 1/16, 4/16, 6/16, 4/16, 1/16 filter runs down the rows, then across the columns.
+__kernel void pyrDown_C4_D2(__global ushort4 * srcData, int srcStep, int srcRows, int srcCols, __global ushort4 *dst, int dstStep, int dstCols)
+{
+    const int x = get_global_id(0); // source column (in ushort4 pixels) handled by this work-item
+    const int y = get_group_id(1);  // destination row
+
+    __local float4 smem[256 + 4]; // vertically filtered pixels; slots 0-1 and 258-259 hold the halo
+    // NOTE(review): the literals 256, 253 and 128 presume 256 work-items along dimension 0 - confirm against the host launch code.
+    float4 sum;
+
+    const int src_y = 2*y; // centre source row; rows src_y - 2 .. src_y + 2 are combined
+    const int last_row = srcRows - 1;
+    const int last_col = srcCols - 1;
+    // Filter weights; each scalar initialiser is replicated into all four components.
+    float4 co1 = 0.375f;
+    float4 co2 = 0.25f;
+    float4 co3 = 0.0625f;
+    // Fast path: rows src_y-2..src_y+2 and columns x-2..x+2 are all inside the image. Row offsets are counted in char4 (4-byte) units, hence srcStep / 4.
+    if (src_y >= 2 && src_y < srcRows - 2 && x >= 2 && x < srcCols - 2)
+    {
+        sum =       co3 * convert_float4(((__global ushort4*)((__global char4*)srcData + (src_y - 2) * srcStep / 4))[x]);
+        sum = sum + co2 * convert_float4(((__global ushort4*)((__global char4*)srcData + (src_y - 1) * srcStep / 4))[x]);
+        sum = sum + co1 * convert_float4(((__global ushort4*)((__global char4*)srcData + (src_y    ) * srcStep / 4))[x]);
+        sum = sum + co2 * convert_float4(((__global ushort4*)((__global char4*)srcData + (src_y + 1) * srcStep / 4))[x]);
+        sum = sum + co3 * convert_float4(((__global ushort4*)((__global char4*)srcData + (src_y + 2) * srcStep / 4))[x]);
+
+        smem[2 + get_local_id(0)] = sum;
+        // Local ids 0 and 1 additionally filter column x - 2 and store it in the left halo slots 0 and 1.
+        if (get_local_id(0) < 2)
+        {
+            const int left_x = x - 2;
+
+            sum =       co3 * convert_float4(((__global ushort4*)((__global char4*)srcData + (src_y - 2) * srcStep / 4))[left_x]);
+            sum = sum + co2 * convert_float4(((__global ushort4*)((__global char4*)srcData + (src_y - 1) * srcStep / 4))[left_x]);
+            sum = sum + co1 * convert_float4(((__global ushort4*)((__global char4*)srcData + (src_y    ) * srcStep / 4))[left_x]);
+            sum = sum + co2 * convert_float4(((__global ushort4*)((__global char4*)srcData + (src_y + 1) * srcStep / 4))[left_x]);
+            sum = sum + co3 * convert_float4(((__global ushort4*)((__global char4*)srcData + (src_y + 2) * srcStep / 4))[left_x]);
+
+            smem[get_local_id(0)] = sum;
+        }
+        // Local ids above 253 additionally filter column x + 2 and store it in the right halo slots (4 + local id).
+        if (get_local_id(0) > 253)
+        {
+            const int right_x = x + 2;
+
+            sum =       co3 * convert_float4(((__global ushort4*)((__global char4*)srcData + (src_y - 2) * srcStep / 4))[right_x]);
+            sum = sum + co2 * convert_float4(((__global ushort4*)((__global char4*)srcData + (src_y - 1) * srcStep / 4))[right_x]);
+            sum = sum + co1 * convert_float4(((__global ushort4*)((__global char4*)srcData + (src_y    ) * srcStep / 4))[right_x]);
+            sum = sum + co2 * convert_float4(((__global ushort4*)((__global char4*)srcData + (src_y + 1) * srcStep / 4))[right_x]);
+            sum = sum + co3 * convert_float4(((__global ushort4*)((__global char4*)srcData + (src_y + 2) * srcStep / 4))[right_x]);
+
+            smem[4 + get_local_id(0)] = sum;
+        }
+    }
+    else // border path: same weighted sums, but every row/column index goes through idx_row / idx_col
+    {
+        int col = idx_col(x, last_col);
+        // NOTE(review): idx_row / idx_col are defined outside this view; presumably they map out-of-range indices back into the image - confirm.
+        sum =       co3 * convert_float4(((__global ushort4*)((__global char4*)srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[col]);
+        sum = sum + co2 * convert_float4(((__global ushort4*)((__global char4*)srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[col]);
+        sum = sum + co1 * convert_float4(((__global ushort4*)((__global char4*)srcData + idx_row(src_y    , last_row) * srcStep / 4))[col]);
+        sum = sum + co2 * convert_float4(((__global ushort4*)((__global char4*)srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[col]);
+        sum = sum + co3 * convert_float4(((__global ushort4*)((__global char4*)srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[col]);
+
+        smem[2 + get_local_id(0)] = sum;
+        // Left halo, border variant: column x - 2 is remapped before it is read.
+        if (get_local_id(0) < 2)
+        {
+            const int left_x = x - 2;
+
+            col = idx_col(left_x, last_col);
+
+            sum =       co3 * convert_float4(((__global ushort4*)((__global char4*)srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[col]);
+            sum = sum + co2 * convert_float4(((__global ushort4*)((__global char4*)srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[col]);
+            sum = sum + co1 * convert_float4(((__global ushort4*)((__global char4*)srcData + idx_row(src_y    , last_row) * srcStep / 4))[col]);
+            sum = sum + co2 * convert_float4(((__global ushort4*)((__global char4*)srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[col]);
+            sum = sum + co3 * convert_float4(((__global ushort4*)((__global char4*)srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[col]);
+
+            smem[get_local_id(0)] = sum;
+        }
+        // Right halo, border variant: column x + 2 is remapped before it is read.
+        if (get_local_id(0) > 253)
+        {
+            const int right_x = x + 2;
+
+            col = idx_col(right_x, last_col);
+
+            sum =       co3 * convert_float4(((__global ushort4*)((__global char4*)srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[col]);
+            sum = sum + co2 * convert_float4(((__global ushort4*)((__global char4*)srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[col]);
+            sum = sum + co1 * convert_float4(((__global ushort4*)((__global char4*)srcData + idx_row(src_y    , last_row) * srcStep / 4))[col]);
+            sum = sum + co2 * convert_float4(((__global ushort4*)((__global char4*)srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[col]);
+            sum = sum + co3 * convert_float4(((__global ushort4*)((__global char4*)srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[col]);
+
+            smem[4 + get_local_id(0)] = sum;
+        }
+    }
+    // Every work-item must have stored its smem values before any neighbouring slot is read below.
+    barrier(CLK_LOCAL_MEM_FENCE);
+    // Horizontal pass: only local ids below 128 produce output, each combining five adjacent smem slots.
+    if (get_local_id(0) < 128)
+    {
+        const int tid2 = get_local_id(0) * 2; // even local column; smem[2 + tid2] is the centre tap
+
+        sum =       co3 * smem[2 + tid2 - 2];
+        sum = sum + co2 * smem[2 + tid2 - 1];
+        sum = sum + co1 * smem[2 + tid2    ];
+        sum = sum + co2 * smem[2 + tid2 + 1];
+        sum = sum + co3 * smem[2 + tid2 + 2];
+
+        const int dst_x = (get_group_id(0) * get_local_size(0) + tid2) / 2;
+        // dstStep / 8 turns the destination step into ushort4 elements; the store saturates and rounds to nearest even.
+        if (dst_x < dstCols)
+            dst[y * dstStep / 8 + dst_x] = convert_ushort4_sat_rte(sum);
+    }
+}
+
+///////////////////////////////////////////////////////////////////////
+//////////////////////////  CV_16SC1 //////////////////////////////////
+///////////////////////////////////////////////////////////////////////
+// Downsamples a one-channel short image by two: a 1/16, 4/16, 6/16, 4/16, 1/16 filter runs down the rows, then across the columns.
+__kernel void pyrDown_C1_D3(__global short * srcData, int srcStep, int srcRows, int srcCols, __global short *dst, int dstStep, int dstCols)
+{
+    const int x = get_global_id(0); // source column handled by this work-item
+    const int y = get_group_id(1);  // destination row
+
+    __local float smem[256 + 4]; // vertically filtered values; slots 0-1 and 258-259 hold the halo
+    // NOTE(review): the literals 256, 253 and 128 presume 256 work-items along dimension 0 - confirm against the host launch code.
+    float sum;
+
+    const int src_y = 2*y; // centre source row; rows src_y - 2 .. src_y + 2 are combined
+    const int last_row = srcRows - 1;
+    const int last_col = srcCols - 1;
+    // Fast path: rows src_y-2..src_y+2 and columns x-2..x+2 are all inside the image. Row offsets go through a char pointer, i.e. srcStep is in bytes.
+    if (src_y >= 2 && src_y < srcRows - 2 && x >= 2 && x < srcCols - 2)
+    {
+        sum =       0.0625f * ((__global short*)((__global char*)srcData + (src_y - 2) * srcStep))[x];
+        sum = sum + 0.25f   * ((__global short*)((__global char*)srcData + (src_y - 1) * srcStep))[x];
+        sum = sum + 0.375f  * ((__global short*)((__global char*)srcData + (src_y    ) * srcStep))[x];
+        sum = sum + 0.25f   * ((__global short*)((__global char*)srcData + (src_y + 1) * srcStep))[x];
+        sum = sum + 0.0625f * ((__global short*)((__global char*)srcData + (src_y + 2) * srcStep))[x];
+
+        smem[2 + get_local_id(0)] = sum;
+        // Local ids 0 and 1 additionally filter column x - 2 and store it in the left halo slots 0 and 1.
+        if (get_local_id(0) < 2)
+        {
+            const int left_x = x - 2;
+
+            sum =       0.0625f * ((__global short*)((__global char*)srcData + (src_y - 2) * srcStep))[left_x];
+            sum = sum + 0.25f   * ((__global short*)((__global char*)srcData + (src_y - 1) * srcStep))[left_x];
+            sum = sum + 0.375f  * ((__global short*)((__global char*)srcData + (src_y    ) * srcStep))[left_x];
+            sum = sum + 0.25f   * ((__global short*)((__global char*)srcData + (src_y + 1) * srcStep))[left_x];
+            sum = sum + 0.0625f * ((__global short*)((__global char*)srcData + (src_y + 2) * srcStep))[left_x];
+
+            smem[get_local_id(0)] = sum;
+        }
+        // Local ids above 253 additionally filter column x + 2 and store it in the right halo slots (4 + local id).
+        if (get_local_id(0) > 253)
+        {
+            const int right_x = x + 2;
+
+            sum =       0.0625f * ((__global short*)((__global char*)srcData + (src_y - 2) * srcStep))[right_x];
+            sum = sum + 0.25f   * ((__global short*)((__global char*)srcData + (src_y - 1) * srcStep))[right_x];
+            sum = sum + 0.375f  * ((__global short*)((__global char*)srcData + (src_y    ) * srcStep))[right_x];
+            sum = sum + 0.25f   * ((__global short*)((__global char*)srcData + (src_y + 1) * srcStep))[right_x];
+            sum = sum + 0.0625f * ((__global short*)((__global char*)srcData + (src_y + 2) * srcStep))[right_x];
+
+            smem[4 + get_local_id(0)] = sum;
+        }
+    }
+    else // border path: same weighted sums, but every row/column index goes through idx_row / idx_col
+    {
+        int col = idx_col(x, last_col);
+        // NOTE(review): idx_row / idx_col are defined outside this view; presumably they map out-of-range indices back into the image - confirm.
+        sum =       0.0625f * ((__global short*)((__global char*)srcData + idx_row(src_y - 2, last_row) * srcStep))[col];
+        sum = sum + 0.25f   * ((__global short*)((__global char*)srcData + idx_row(src_y - 1, last_row) * srcStep))[col];
+        sum = sum + 0.375f  * ((__global short*)((__global char*)srcData + idx_row(src_y    , last_row) * srcStep))[col];
+        sum = sum + 0.25f   * ((__global short*)((__global char*)srcData + idx_row(src_y + 1, last_row) * srcStep))[col];
+        sum = sum + 0.0625f * ((__global short*)((__global char*)srcData + idx_row(src_y + 2, last_row) * srcStep))[col];
+
+        smem[2 + get_local_id(0)] = sum;
+        // Left halo, border variant: column x - 2 is remapped before it is read.
+        if (get_local_id(0) < 2)
+        {
+            const int left_x = x - 2;
+
+            col = idx_col(left_x, last_col);
+
+            sum =       0.0625f * ((__global short*)((__global char*)srcData + idx_row(src_y - 2, last_row) * srcStep))[col];
+            sum = sum + 0.25f   * ((__global short*)((__global char*)srcData + idx_row(src_y - 1, last_row) * srcStep))[col];
+            sum = sum + 0.375f  * ((__global short*)((__global char*)srcData + idx_row(src_y    , last_row) * srcStep))[col];
+            sum = sum + 0.25f   * ((__global short*)((__global char*)srcData + idx_row(src_y + 1, last_row) * srcStep))[col];
+            sum = sum + 0.0625f * ((__global short*)((__global char*)srcData + idx_row(src_y + 2, last_row) * srcStep))[col];
+
+            smem[get_local_id(0)] = sum;
+        }
+        // Right halo, border variant: column x + 2 is remapped before it is read.
+        if (get_local_id(0) > 253)
+        {
+            const int right_x = x + 2;
+
+            col = idx_col(right_x, last_col);
+
+            sum =       0.0625f * ((__global short*)((__global char*)srcData + idx_row(src_y - 2, last_row) * srcStep))[col];
+            sum = sum + 0.25f   * ((__global short*)((__global char*)srcData + idx_row(src_y - 1, last_row) * srcStep))[col];
+            sum = sum + 0.375f  * ((__global short*)((__global char*)srcData + idx_row(src_y    , last_row) * srcStep))[col];
+            sum = sum + 0.25f   * ((__global short*)((__global char*)srcData + idx_row(src_y + 1, last_row) * srcStep))[col];
+            sum = sum + 0.0625f * ((__global short*)((__global char*)srcData + idx_row(src_y + 2, last_row) * srcStep))[col];
+
+            smem[4 + get_local_id(0)] = sum;
+        }
+    }
+    // Every work-item must have stored its smem values before any neighbouring slot is read below.
+    barrier(CLK_LOCAL_MEM_FENCE);
+    // Horizontal pass: only local ids below 128 produce output, each combining five adjacent smem slots.
+    if (get_local_id(0) < 128)
+    {
+        const int tid2 = get_local_id(0) * 2; // even local column; smem[2 + tid2] is the centre tap
+
+        sum =       0.0625f * smem[2 + tid2 - 2];
+        sum = sum + 0.25f   * smem[2 + tid2 - 1];
+        sum = sum + 0.375f  * smem[2 + tid2    ];
+        sum = sum + 0.25f   * smem[2 + tid2 + 1];
+        sum = sum + 0.0625f * smem[2 + tid2 + 2];
+
+        const int dst_x = (get_group_id(0) * get_local_size(0) + tid2) / 2;
+        // dstStep / 2 turns the destination step into short elements; the store saturates and rounds to nearest even.
+        if (dst_x < dstCols)
+            dst[y * dstStep / 2 + dst_x] = convert_short_sat_rte(sum);
+    }
+}
+
+///////////////////////////////////////////////////////////////////////
+//////////////////////////  CV_16SC4 //////////////////////////////////
+///////////////////////////////////////////////////////////////////////
+// Downsamples a four-channel short image by two: a 1/16, 4/16, 6/16, 4/16, 1/16 filter runs down the rows, then across the columns.
+__kernel void pyrDown_C4_D3(__global short4 * srcData, int srcStep, int srcRows, int srcCols, __global short4 *dst, int dstStep, int dstCols)
+{
+    const int x = get_global_id(0); // source column (in short4 pixels) handled by this work-item
+    const int y = get_group_id(1);  // destination row
+
+    __local float4 smem[256 + 4]; // vertically filtered pixels; slots 0-1 and 258-259 hold the halo
+    // NOTE(review): the literals 256, 253 and 128 presume 256 work-items along dimension 0 - confirm against the host launch code.
+    float4 sum;
+
+    const int src_y = 2*y; // centre source row; rows src_y - 2 .. src_y + 2 are combined
+    const int last_row = srcRows - 1;
+    const int last_col = srcCols - 1;
+    // Filter weights; each scalar initialiser is replicated into all four components.
+    float4 co1 = 0.375f;
+    float4 co2 = 0.25f;
+    float4 co3 = 0.0625f;
+    // Fast path: rows src_y-2..src_y+2 and columns x-2..x+2 are all inside the image. Row offsets are counted in char4 (4-byte) units, hence srcStep / 4.
+    if (src_y >= 2 && src_y < srcRows - 2 && x >= 2 && x < srcCols - 2)
+    {
+        sum =       co3 * convert_float4(((__global short4*)((__global char4*)srcData + (src_y - 2) * srcStep / 4))[x]);
+        sum = sum + co2 * convert_float4(((__global short4*)((__global char4*)srcData + (src_y - 1) * srcStep / 4))[x]);
+        sum = sum + co1 * convert_float4(((__global short4*)((__global char4*)srcData + (src_y    ) * srcStep / 4))[x]);
+        sum = sum + co2 * convert_float4(((__global short4*)((__global char4*)srcData + (src_y + 1) * srcStep / 4))[x]);
+        sum = sum + co3 * convert_float4(((__global short4*)((__global char4*)srcData + (src_y + 2) * srcStep / 4))[x]);
+
+        smem[2 + get_local_id(0)] = sum;
+        // Local ids 0 and 1 additionally filter column x - 2 and store it in the left halo slots 0 and 1.
+        if (get_local_id(0) < 2)
+        {
+            const int left_x = x - 2;
+
+            sum =       co3 * convert_float4(((__global short4*)((__global char4*)srcData + (src_y - 2) * srcStep / 4))[left_x]);
+            sum = sum + co2 * convert_float4(((__global short4*)((__global char4*)srcData + (src_y - 1) * srcStep / 4))[left_x]);
+            sum = sum + co1 * convert_float4(((__global short4*)((__global char4*)srcData + (src_y    ) * srcStep / 4))[left_x]);
+            sum = sum + co2 * convert_float4(((__global short4*)((__global char4*)srcData + (src_y + 1) * srcStep / 4))[left_x]);
+            sum = sum + co3 * convert_float4(((__global short4*)((__global char4*)srcData + (src_y + 2) * srcStep / 4))[left_x]);
+
+            smem[get_local_id(0)] = sum;
+        }
+        // Local ids above 253 additionally filter column x + 2 and store it in the right halo slots (4 + local id).
+        if (get_local_id(0) > 253)
+        {
+            const int right_x = x + 2;
+
+            sum =       co3 * convert_float4(((__global short4*)((__global char4*)srcData + (src_y - 2) * srcStep / 4))[right_x]);
+            sum = sum + co2 * convert_float4(((__global short4*)((__global char4*)srcData + (src_y - 1) * srcStep / 4))[right_x]);
+            sum = sum + co1 * convert_float4(((__global short4*)((__global char4*)srcData + (src_y    ) * srcStep / 4))[right_x]);
+            sum = sum + co2 * convert_float4(((__global short4*)((__global char4*)srcData + (src_y + 1) * srcStep / 4))[right_x]);
+            sum = sum + co3 * convert_float4(((__global short4*)((__global char4*)srcData + (src_y + 2) * srcStep / 4))[right_x]);
+
+            smem[4 + get_local_id(0)] = sum;
+        }
+    }
+    else // border path: same weighted sums, but every row/column index goes through idx_row / idx_col
+    {
+        int col = idx_col(x, last_col);
+        // NOTE(review): idx_row / idx_col are defined outside this view; presumably they map out-of-range indices back into the image - confirm.
+        sum =       co3 * convert_float4(((__global short4*)((__global char4*)srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[col]);
+        sum = sum + co2 * convert_float4(((__global short4*)((__global char4*)srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[col]);
+        sum = sum + co1 * convert_float4(((__global short4*)((__global char4*)srcData + idx_row(src_y    , last_row) * srcStep / 4))[col]);
+        sum = sum + co2 * convert_float4(((__global short4*)((__global char4*)srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[col]);
+        sum = sum + co3 * convert_float4(((__global short4*)((__global char4*)srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[col]);
+
+        smem[2 + get_local_id(0)] = sum;
+        // Left halo, border variant: column x - 2 is remapped before it is read.
+        if (get_local_id(0) < 2)
+        {
+            const int left_x = x - 2;
+
+            col = idx_col(left_x, last_col);
+
+            sum =       co3 * convert_float4(((__global short4*)((__global char4*)srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[col]);
+            sum = sum + co2 * convert_float4(((__global short4*)((__global char4*)srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[col]);
+            sum = sum + co1 * convert_float4(((__global short4*)((__global char4*)srcData + idx_row(src_y    , last_row) * srcStep / 4))[col]);
+            sum = sum + co2 * convert_float4(((__global short4*)((__global char4*)srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[col]);
+            sum = sum + co3 * convert_float4(((__global short4*)((__global char4*)srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[col]);
+
+            smem[get_local_id(0)] = sum;
+        }
+        // Right halo, border variant: column x + 2 is remapped before it is read.
+        if (get_local_id(0) > 253)
+        {
+            const int right_x = x + 2;
+
+            col = idx_col(right_x, last_col);
+
+            sum =       co3 * convert_float4(((__global short4*)((__global char4*)srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[col]);
+            sum = sum + co2 * convert_float4(((__global short4*)((__global char4*)srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[col]);
+            sum = sum + co1 * convert_float4(((__global short4*)((__global char4*)srcData + idx_row(src_y    , last_row) * srcStep / 4))[col]);
+            sum = sum + co2 * convert_float4(((__global short4*)((__global char4*)srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[col]);
+            sum = sum + co3 * convert_float4(((__global short4*)((__global char4*)srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[col]);
+
+            smem[4 + get_local_id(0)] = sum;
+        }
+    }
+    // Every work-item must have stored its smem values before any neighbouring slot is read below.
+    barrier(CLK_LOCAL_MEM_FENCE);
+    // Horizontal pass: only local ids below 128 produce output, each combining five adjacent smem slots.
+    if (get_local_id(0) < 128)
+    {
+        const int tid2 = get_local_id(0) * 2; // even local column; smem[2 + tid2] is the centre tap
+
+        sum =       co3 * smem[2 + tid2 - 2];
+        sum = sum + co2 * smem[2 + tid2 - 1];
+        sum = sum + co1 * smem[2 + tid2    ];
+        sum = sum + co2 * smem[2 + tid2 + 1];
+        sum = sum + co3 * smem[2 + tid2 + 2];
+
+        const int dst_x = (get_group_id(0) * get_local_size(0) + tid2) / 2;
+        // dstStep / 8 turns the destination step into short4 elements; the store saturates and rounds to nearest even.
+        if (dst_x < dstCols)
+            dst[y * dstStep / 8 + dst_x] = convert_short4_sat_rte(sum);
+    }
+}
+
+///////////////////////////////////////////////////////////////////////
+//////////////////////////  CV_32FC1 //////////////////////////////////
+///////////////////////////////////////////////////////////////////////
+
 __kernel void pyrDown_C1_D5(__global float * srcData, int srcStep, int srcRows, int srcCols, __global float *dst, int dstStep, int dstCols)
 {
     const int x = get_global_id(0);
@@ -441,6 +890,10 @@ __kernel void pyrDown_C1_D5(__global float * srcData, int srcStep, int srcRows,
     }
 }
 
+///////////////////////////////////////////////////////////////////////
+//////////////////////////  CV_32FC4 //////////////////////////////////
+///////////////////////////////////////////////////////////////////////
+
 __kernel void pyrDown_C4_D5(__global float4 * srcData, int srcStep, int srcRows, int srcCols, __global float4 *dst, int dstStep, int dstCols)
 {
     const int x = get_global_id(0);
@@ -454,16 +907,16 @@ __kernel void pyrDown_C4_D5(__global float4 * srcData, int srcStep, int srcRows,
     const int last_row = srcRows - 1;
     const int last_col = srcCols - 1;
 
-    float4 co1 = 0.375f;//(float4)(0.375f, 0.375f, 0.375f, 0.375f);
-    float4 co2 = 0.25f;//(float4)(0.25f, 0.25f, 0.25f, 0.25f);
-    float4 co3 = 0.0625f;//(float4)(0.0625f, 0.0625f, 0.0625f, 0.0625f);
+    float4 co1 = 0.375f;
+    float4 co2 = 0.25f;
+    float4 co3 = 0.0625f;
 
     if (src_y >= 2 && src_y < srcRows - 2 && x >= 2 && x < srcCols - 2)
     {
         sum =       co3 * ((__global float4*)((__global char4*)srcData + (src_y - 2) * srcStep / 4))[x];
-        sum = sum + co2   * ((__global float4*)((__global char4*)srcData + (src_y - 1) * srcStep / 4))[x];
-        sum = sum + co1  * ((__global float4*)((__global char4*)srcData + (src_y    ) * srcStep / 4))[x];
-        sum = sum + co2   * ((__global float4*)((__global char4*)srcData + (src_y + 1) * srcStep / 4))[x];
+        sum = sum + co2 * ((__global float4*)((__global char4*)srcData + (src_y - 1) * srcStep / 4))[x];
+        sum = sum + co1 * ((__global float4*)((__global char4*)srcData + (src_y    ) * srcStep / 4))[x];
+        sum = sum + co2 * ((__global float4*)((__global char4*)srcData + (src_y + 1) * srcStep / 4))[x];
         sum = sum + co3 * ((__global float4*)((__global char4*)srcData + (src_y + 2) * srcStep / 4))[x];
 
         smem[2 + get_local_id(0)] = sum;
@@ -473,9 +926,9 @@ __kernel void pyrDown_C4_D5(__global float4 * srcData, int srcStep, int srcRows,
             const int left_x = x - 2;
 
             sum =       co3 * ((__global float4*)((__global char4*)srcData + (src_y - 2) * srcStep / 4))[left_x];
-            sum = sum + co2   * ((__global float4*)((__global char4*)srcData + (src_y - 1) * srcStep / 4))[left_x];
-            sum = sum + co1  * ((__global float4*)((__global char4*)srcData + (src_y    ) * srcStep / 4))[left_x];
-            sum = sum + co2   * ((__global float4*)((__global char4*)srcData + (src_y + 1) * srcStep / 4))[left_x];
+            sum = sum + co2 * ((__global float4*)((__global char4*)srcData + (src_y - 1) * srcStep / 4))[left_x];
+            sum = sum + co1 * ((__global float4*)((__global char4*)srcData + (src_y    ) * srcStep / 4))[left_x];
+            sum = sum + co2 * ((__global float4*)((__global char4*)srcData + (src_y + 1) * srcStep / 4))[left_x];
             sum = sum + co3 * ((__global float4*)((__global char4*)srcData + (src_y + 2) * srcStep / 4))[left_x];
 
             smem[get_local_id(0)] = sum;
@@ -486,9 +939,9 @@ __kernel void pyrDown_C4_D5(__global float4 * srcData, int srcStep, int srcRows,
             const int right_x = x + 2;
 
             sum =       co3 * ((__global float4*)((__global char4*)srcData + (src_y - 2) * srcStep / 4))[right_x];
-            sum = sum + co2   * ((__global float4*)((__global char4*)srcData + (src_y - 1) * srcStep / 4))[right_x];
-            sum = sum + co1  * ((__global float4*)((__global char4*)srcData + (src_y    ) * srcStep / 4))[right_x];
-            sum = sum + co2   * ((__global float4*)((__global char4*)srcData + (src_y + 1) * srcStep / 4))[right_x];
+            sum = sum + co2 * ((__global float4*)((__global char4*)srcData + (src_y - 1) * srcStep / 4))[right_x];
+            sum = sum + co1 * ((__global float4*)((__global char4*)srcData + (src_y    ) * srcStep / 4))[right_x];
+            sum = sum + co2 * ((__global float4*)((__global char4*)srcData + (src_y + 1) * srcStep / 4))[right_x];
             sum = sum + co3 * ((__global float4*)((__global char4*)srcData + (src_y + 2) * srcStep / 4))[right_x];
 
             smem[4 + get_local_id(0)] = sum;
@@ -499,9 +952,9 @@ __kernel void pyrDown_C4_D5(__global float4 * srcData, int srcStep, int srcRows,
         int col = idx_col(x, last_col);
 
         sum =       co3 * ((__global float4*)((__global char4*)srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[col];
-        sum = sum + co2   * ((__global float4*)((__global char4*)srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[col];
-        sum = sum + co1  * ((__global float4*)((__global char4*)srcData + idx_row(src_y    , last_row) * srcStep / 4))[col];
-        sum = sum + co2   * ((__global float4*)((__global char4*)srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[col];
+        sum = sum + co2 * ((__global float4*)((__global char4*)srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[col];
+        sum = sum + co1 * ((__global float4*)((__global char4*)srcData + idx_row(src_y    , last_row) * srcStep / 4))[col];
+        sum = sum + co2 * ((__global float4*)((__global char4*)srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[col];
         sum = sum + co3 * ((__global float4*)((__global char4*)srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[col];
 
         smem[2 + get_local_id(0)] = sum;
@@ -513,9 +966,9 @@ __kernel void pyrDown_C4_D5(__global float4 * srcData, int srcStep, int srcRows,
             col = idx_col(left_x, last_col);
 
             sum =       co3 * ((__global float4*)((__global char4*)srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[col];
-            sum = sum + co2   * ((__global float4*)((__global char4*)srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[col];
-            sum = sum + co1  * ((__global float4*)((__global char4*)srcData + idx_row(src_y    , last_row) * srcStep / 4))[col];
-            sum = sum + co2   * ((__global float4*)((__global char4*)srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[col];
+            sum = sum + co2 * ((__global float4*)((__global char4*)srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[col];
+            sum = sum + co1 * ((__global float4*)((__global char4*)srcData + idx_row(src_y    , last_row) * srcStep / 4))[col];
+            sum = sum + co2 * ((__global float4*)((__global char4*)srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[col];
             sum = sum + co3 * ((__global float4*)((__global char4*)srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[col];
 
             smem[get_local_id(0)] = sum;
@@ -528,9 +981,9 @@ __kernel void pyrDown_C4_D5(__global float4 * srcData, int srcStep, int srcRows,
             col = idx_col(right_x, last_col);
 
             sum =       co3 * ((__global float4*)((__global char4*)srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[col];
-            sum = sum + co2   * ((__global float4*)((__global char4*)srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[col];
-            sum = sum + co1  * ((__global float4*)((__global char4*)srcData + idx_row(src_y    , last_row) * srcStep / 4))[col];
-            sum = sum + co2   * ((__global float4*)((__global char4*)srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[col];
+            sum = sum + co2 * ((__global float4*)((__global char4*)srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[col];
+            sum = sum + co1 * ((__global float4*)((__global char4*)srcData + idx_row(src_y    , last_row) * srcStep / 4))[col];
+            sum = sum + co2 * ((__global float4*)((__global char4*)srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[col];
             sum = sum + co3 * ((__global float4*)((__global char4*)srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[col];
 
             smem[4 + get_local_id(0)] = sum;
@@ -544,9 +997,9 @@ __kernel void pyrDown_C4_D5(__global float4 * srcData, int srcStep, int srcRows,
         const int tid2 = get_local_id(0) * 2;
 
         sum =       co3 * smem[2 + tid2 - 2];
-        sum = sum + co2   * smem[2 + tid2 - 1];
-        sum = sum + co1  * smem[2 + tid2    ];
-        sum = sum + co2   * smem[2 + tid2 + 1];
+        sum = sum + co2 * smem[2 + tid2 - 1];
+        sum = sum + co1 * smem[2 + tid2    ];
+        sum = sum + co2 * smem[2 + tid2 + 1];
         sum = sum + co3 * smem[2 + tid2 + 2];
 
         const int dst_x = (get_group_id(0) * get_local_size(0) + tid2) / 2;
diff --git a/modules/ocl/src/opencl/pyr_up.cl b/modules/ocl/src/opencl/pyr_up.cl
index 4afa7b710..88efa9539 100644
--- a/modules/ocl/src/opencl/pyr_up.cl
+++ b/modules/ocl/src/opencl/pyr_up.cl
@@ -46,230 +46,25 @@
 //
 //M*/
 
-//#pragma OPENCL EXTENSION cl_amd_printf : enable
+///////////////////////////////////////////////////////////////////////
+////////////////////////  Generic PyrUp  //////////////////////////////
+///////////////////////////////////////////////////////////////////////
 
-uchar get_valid_uchar(float data)
-{
-    return (uchar)(data <= 255 ? data : data > 0 ? 255 : 0);
-}
-///////////////////////////////////////////////////////////////////////
-//////////////////////////  CV_8UC1  //////////////////////////////////
-///////////////////////////////////////////////////////////////////////
-__kernel void pyrUp_C1_D0(__global uchar* src,__global uchar* dst,
-                          int srcRows,int dstRows,int srcCols,int dstCols,
-                          int srcOffset,int dstOffset,int srcStep,int dstStep)
+__kernel void pyrUp(__global Type* src, __global Type* dst,
+                          int srcRows, int dstRows, int srcCols, int dstCols,
+                          int srcOffset, int dstOffset, int srcStep, int dstStep)
 {
     const int x = get_global_id(0);
     const int y = get_global_id(1);
-    __local float s_srcPatch[10][10];
-    __local float s_dstPatch[20][16];
-    const int tidx = get_local_id(0);
-    const int tidy = get_local_id(1);
+
     const int lsizex = get_local_size(0);
     const int lsizey = get_local_size(1);
 
-    if( tidx < 10 && tidy < 10 )
-    {
-        int srcx = mad24((int)get_group_id(0), (lsizex>>1), tidx) - 1;
-        int srcy = mad24((int)get_group_id(1), (lsizey>>1), tidy) - 1;
-
-        srcx = abs(srcx);
-        srcx = min(srcCols - 1,srcx);
-
-        srcy = abs(srcy);
-        srcy = min(srcRows -1 ,srcy);
-
-        s_srcPatch[tidy][tidx] = (float)(src[srcx + srcy * srcStep]);
-
-    }
-
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    float sum = 0;
-    const int evenFlag = (int)((tidx & 1) == 0);
-    const int oddFlag = (int)((tidx & 1) != 0);
-    const bool  eveny = ((tidy & 1) == 0);
-
-    if(eveny)
-    {
-        sum = (evenFlag * 0.0625f) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx - 2) >> 1)];
-        sum = sum + ( oddFlag * 0.25f  ) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx - 1) >> 1)];
-        sum = sum + (evenFlag * 0.375f ) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx    ) >> 1)];
-        sum = sum + ( oddFlag * 0.25f  ) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx + 1) >> 1)];
-        sum = sum + (evenFlag * 0.0625f) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx + 2) >> 1)];
-    }
-
-    s_dstPatch[2 + get_local_id(1)][get_local_id(0)] = sum;
-
-    if (get_local_id(1) < 2)
-    {
-        sum = 0;
-
-        if (eveny)
-        {
-            sum = (evenFlag * 0.0625f) * s_srcPatch[lsizey - 16][1 + ((tidx - 2) >> 1)];
-            sum = sum + ( oddFlag * 0.25f  ) * s_srcPatch[lsizey - 16][1 + ((tidx - 1) >> 1)];
-            sum = sum + (evenFlag * 0.375f ) * s_srcPatch[lsizey - 16][1 + ((tidx    ) >> 1)];
-            sum = sum + ( oddFlag * 0.25f  ) * s_srcPatch[lsizey - 16][1 + ((tidx + 1) >> 1)];
-            sum = sum + (evenFlag * 0.0625f) * s_srcPatch[lsizey - 16][1 + ((tidx + 2) >> 1)];
-        }
-
-        s_dstPatch[get_local_id(1)][get_local_id(0)] = sum;
-    }
-
-    if (get_local_id(1) > 13)
-    {
-        sum = 0;
-
-        if (eveny)
-        {
-            sum = (evenFlag * 0.0625f) * s_srcPatch[lsizey - 7][1 + ((tidx - 2) >> 1)];
-            sum = sum + ( oddFlag * 0.25f  ) * s_srcPatch[lsizey - 7][1 + ((tidx - 1) >> 1)];
-            sum = sum + (evenFlag * 0.375f ) * s_srcPatch[lsizey - 7][1 + ((tidx    ) >> 1)];
-            sum = sum + ( oddFlag * 0.25f  ) * s_srcPatch[lsizey - 7][1 + ((tidx + 1) >> 1)];
-            sum = sum + (evenFlag * 0.0625f) * s_srcPatch[9][1 + ((tidx + 2) >> 1)];
-        }
-        s_dstPatch[4 + tidy][tidx] = sum;
-    }
-
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    sum = 0;
-
-    sum = 0.0625f * s_dstPatch[2 + tidy - 2][tidx];
-    sum = sum + 0.25f   * s_dstPatch[2 + tidy - 1][tidx];
-    sum = sum + 0.375f  * s_dstPatch[2 + tidy    ][tidx];
-    sum = sum + 0.25f   * s_dstPatch[2 + tidy + 1][tidx];
-    sum = sum + 0.0625f * s_dstPatch[2 + tidy + 2][tidx];
-
-    if ((x < dstCols) && (y < dstRows))
-        dst[x + y * dstStep] = convert_uchar_sat_rte(4.0f * sum);
-
-}
-
-///////////////////////////////////////////////////////////////////////
-//////////////////////////  CV_16UC1  /////////////////////////////////
-///////////////////////////////////////////////////////////////////////
-__kernel void pyrUp_C1_D2(__global ushort* src,__global ushort* dst,
-                          int srcRows,int dstRows,int srcCols,int dstCols,
-                          int srcOffset,int dstOffset,int srcStep,int dstStep)
-{
-    const int x = get_global_id(0);
-    const int y = get_global_id(1);
-
-    __local float s_srcPatch[10][10];
-    __local float s_dstPatch[20][16];
-
-    srcStep = srcStep >> 1;
-    dstStep = dstStep >> 1;
-    srcOffset = srcOffset >> 1;
-    dstOffset = dstOffset >> 1;
-
-
-    if( get_local_id(0) < 10 && get_local_id(1) < 10 )
-    {
-        int srcx = (int)(get_group_id(0) * get_local_size(0) / 2 + get_local_id(0)) - 1;
-        int srcy = (int)(get_group_id(1) * get_local_size(1) / 2 + get_local_id(1)) - 1;
-
-        srcx = abs(srcx);
-        srcx = min(srcCols - 1,srcx);
-
-        srcy = abs(srcy);
-        srcy = min(srcRows -1 ,srcy);
-
-        s_srcPatch[get_local_id(1)][get_local_id(0)] = (float)(src[srcx + srcy * srcStep]);
-
-    }
-
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    float sum = 0;
-
-    const int evenFlag = (int)((get_local_id(0) & 1) == 0);
-    const int oddFlag = (int)((get_local_id(0) & 1) != 0);
-    const bool  eveny = ((get_local_id(1) & 1) == 0);
-    const int tidx = get_local_id(0);
-
-    if(eveny)
-    {
-        sum = sum + (evenFlag * 0.0625f) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 2) >> 1)];
-        sum = sum + ( oddFlag * 0.25f  ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 1) >> 1)];
-        sum = sum + (evenFlag * 0.375f ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx    ) >> 1)];
-        sum = sum + ( oddFlag * 0.25f  ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 1) >> 1)];
-        sum = sum + (evenFlag * 0.0625f) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 2) >> 1)];
-    }
-
-    s_dstPatch[2 + get_local_id(1)][get_local_id(0)] = sum;
-
-    if (get_local_id(1) < 2)
-    {
-        sum = 0;
-
-        if (eveny)
-        {
-            sum = sum + (evenFlag * 0.0625f) * s_srcPatch[0][1 + ((tidx - 2) >> 1)];
-            sum = sum + ( oddFlag * 0.25f  ) * s_srcPatch[0][1 + ((tidx - 1) >> 1)];
-            sum = sum + (evenFlag * 0.375f ) * s_srcPatch[0][1 + ((tidx    ) >> 1)];
-            sum = sum + ( oddFlag * 0.25f  ) * s_srcPatch[0][1 + ((tidx + 1) >> 1)];
-            sum = sum + (evenFlag * 0.0625f) * s_srcPatch[0][1 + ((tidx + 2) >> 1)];
-        }
-
-        s_dstPatch[get_local_id(1)][get_local_id(0)] = sum;
-    }
-
-    if (get_local_id(1) > 13)
-    {
-        sum = 0;
-
-        if (eveny)
-        {
-            sum = sum + (evenFlag * 0.0625f) * s_srcPatch[9][1 + ((tidx - 2) >> 1)];
-            sum = sum + ( oddFlag * 0.25f  ) * s_srcPatch[9][1 + ((tidx - 1) >> 1)];
-            sum = sum + (evenFlag * 0.375f ) * s_srcPatch[9][1 + ((tidx    ) >> 1)];
-            sum = sum + ( oddFlag * 0.25f  ) * s_srcPatch[9][1 + ((tidx + 1) >> 1)];
-            sum = sum + (evenFlag * 0.0625f) * s_srcPatch[9][1 + ((tidx + 2) >> 1)];
-        }
-        s_dstPatch[4 + get_local_id(1)][get_local_id(0)] = sum;
-    }
-
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    sum = 0;
-
-    const int tidy = get_local_id(1);
-
-    sum = sum + 0.0625f * s_dstPatch[2 + tidy - 2][get_local_id(0)];
-    sum = sum + 0.25f   * s_dstPatch[2 + tidy - 1][get_local_id(0)];
-    sum = sum + 0.375f  * s_dstPatch[2 + tidy    ][get_local_id(0)];
-    sum = sum + 0.25f   * s_dstPatch[2 + tidy + 1][get_local_id(0)];
-    sum = sum + 0.0625f * s_dstPatch[2 + tidy + 2][get_local_id(0)];
-
-    if ((x < dstCols) && (y < dstRows))
-        dst[x + y * dstStep] = convert_short_sat_rte(4.0f * sum);
-
-}
-
-///////////////////////////////////////////////////////////////////////
-//////////////////////////  CV_32FC1  /////////////////////////////////
-///////////////////////////////////////////////////////////////////////
-__kernel void pyrUp_C1_D5(__global float* src,__global float* dst,
-                          int srcRows,int dstRows,int srcCols,int dstCols,
-                          int srcOffset,int dstOffset,int srcStep,int dstStep)
-{
-    const int x = get_global_id(0);
-    const int y = get_global_id(1);
     const int tidx = get_local_id(0);
     const int tidy = get_local_id(1);
-    const int lsizex = get_local_size(0);
-    const int lsizey = get_local_size(1);
-    __local float s_srcPatch[10][10];
-    __local float s_dstPatch[20][16];
-
-    srcOffset = srcOffset >> 2;
-    dstOffset = dstOffset >> 2;
-    srcStep = srcStep >> 2;
-    dstStep = dstStep >> 2;
 
+    __local floatType s_srcPatch[10][10];
+    __local floatType s_dstPatch[20][16];
 
     if( tidx < 10 && tidy < 10 )
     {
@@ -282,346 +77,27 @@ __kernel void pyrUp_C1_D5(__global float* src,__global float* dst,
         srcy = abs(srcy);
         srcy = min(srcRows -1 ,srcy);
 
-        s_srcPatch[tidy][tidx] = (float)(src[srcx + srcy * srcStep]);
-
+        s_srcPatch[tidy][tidx] = convertToFloat(src[srcx + srcy * srcStep]);
     }
 
     barrier(CLK_LOCAL_MEM_FENCE);
 
-    float sum = 0;
-    const int evenFlag = (int)((tidx & 1) == 0);
-    const int oddFlag = (int)((tidx & 1) != 0);
+    floatType sum = (floatType)0;
+    const floatType evenFlag = (floatType)((tidx & 1) == 0);
+    const floatType oddFlag = (floatType)((tidx & 1) != 0);
     const bool  eveny = ((tidy & 1) == 0);
 
+    const floatType co1 = (floatType)0.375f;
+    const floatType co2 = (floatType)0.25f;
+    const floatType co3 = (floatType)0.0625f;
 
     if(eveny)
     {
-        sum = sum + (evenFlag * 0.0625f) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx - 2) >> 1)];
-        sum = sum + ( oddFlag * 0.25f  ) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx - 1) >> 1)];
-        sum = sum + (evenFlag * 0.375f ) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx    ) >> 1)];
-        sum = sum + ( oddFlag * 0.25f  ) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx + 1) >> 1)];
-        sum = sum + (evenFlag * 0.0625f) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx + 2) >> 1)];
-    }
-
-    s_dstPatch[2 + tidy][tidx] = sum;
-
-    if (tidy < 2)
-    {
-        sum = 0;
-
-        if (eveny)
-        {
-            sum = sum + (evenFlag * 0.0625f) * s_srcPatch[lsizey - 16][1 + ((tidx - 2) >> 1)];
-            sum = sum + ( oddFlag * 0.25f  ) * s_srcPatch[lsizey - 16][1 + ((tidx - 1) >> 1)];
-            sum = sum + (evenFlag * 0.375f ) * s_srcPatch[lsizey - 16][1 + ((tidx    ) >> 1)];
-            sum = sum + ( oddFlag * 0.25f  ) * s_srcPatch[lsizey - 16][1 + ((tidx + 1) >> 1)];
-            sum = sum + (evenFlag * 0.0625f) * s_srcPatch[lsizey - 16][1 + ((tidx + 2) >> 1)];
-        }
-
-        s_dstPatch[tidy][tidx] = sum;
-    }
-
-    if (tidy > 13)
-    {
-        sum = 0;
-
-        if (eveny)
-        {
-            sum = sum + (evenFlag * 0.0625f) * s_srcPatch[lsizey - 7][1 + ((tidx - 2) >> 1)];
-            sum = sum + ( oddFlag * 0.25f  ) * s_srcPatch[lsizey - 7][1 + ((tidx - 1) >> 1)];
-            sum = sum + (evenFlag * 0.375f ) * s_srcPatch[lsizey - 7][1 + ((tidx    ) >> 1)];
-            sum = sum + ( oddFlag * 0.25f  ) * s_srcPatch[lsizey - 7][1 + ((tidx + 1) >> 1)];
-            sum = sum + (evenFlag * 0.0625f) * s_srcPatch[lsizey - 7][1 + ((tidx + 2) >> 1)];
-        }
-        s_dstPatch[4 + tidy][tidx] = sum;
-    }
-
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    sum = 0.0625f * s_dstPatch[2 + tidy - 2][tidx];
-    sum = sum + 0.25f   * s_dstPatch[2 + tidy - 1][tidx];
-    sum = sum + 0.375f  * s_dstPatch[2 + tidy    ][tidx];
-    sum = sum + 0.25f   * s_dstPatch[2 + tidy + 1][tidx];
-    sum = sum + 0.0625f * s_dstPatch[2 + tidy + 2][tidx];
-
-    if ((x < dstCols) && (y < dstRows))
-        dst[x + y * dstStep] = (float)(4.0f * sum);
-
-}
-
-///////////////////////////////////////////////////////////////////////
-//////////////////////////  CV_8UC4  //////////////////////////////////
-///////////////////////////////////////////////////////////////////////
-__kernel void pyrUp_C4_D0(__global uchar4* src,__global uchar4* dst,
-                          int srcRows,int dstRows,int srcCols,int dstCols,
-                          int srcOffset,int dstOffset,int srcStep,int dstStep)
-{
-    const int x = get_global_id(0);
-    const int y = get_global_id(1);
-    const int tidx = get_local_id(0);
-    const int tidy = get_local_id(1);
-    const int lsizex = get_local_size(0);
-    const int lsizey = get_local_size(1);
-    __local float4 s_srcPatch[10][10];
-    __local float4 s_dstPatch[20][16];
-
-    srcOffset >>= 2;
-    dstOffset >>= 2;
-    srcStep >>= 2;
-    dstStep >>= 2;
-
-
-    if( tidx < 10 && tidy < 10 )
-    {
-        int srcx = mad24((int)get_group_id(0), lsizex>>1, tidx) - 1;
-        int srcy = mad24((int)get_group_id(1), lsizey>>1, tidy) - 1;
-
-        srcx = abs(srcx);
-        srcx = min(srcCols - 1,srcx);
-
-        srcy = abs(srcy);
-        srcy = min(srcRows -1 ,srcy);
-
-        s_srcPatch[tidy][tidx] = convert_float4(src[srcx + srcy * srcStep]);
-    }
-
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    float4 sum = (float4)(0,0,0,0);
-
-    const float4 evenFlag = (float4)((tidx & 1) == 0);
-    const float4 oddFlag = (float4)((tidx & 1) != 0);
-    const bool  eveny = ((tidy & 1) == 0);
-
-    float4 co1 = (float4)(0.375f, 0.375f, 0.375f, 0.375f);
-    float4 co2 = (float4)(0.25f, 0.25f, 0.25f, 0.25f);
-    float4 co3 = (float4)(0.0625f, 0.0625f, 0.0625f, 0.0625f);
-
-
-    if(eveny)
-    {
-        sum = sum + ( evenFlag * co3) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx - 2) >> 1)];
-        sum = sum + ( oddFlag * co2 ) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx - 1) >> 1)];
-        sum = sum + ( evenFlag * co1) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx    ) >> 1)];
-        sum = sum + ( oddFlag * co2 ) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx + 1) >> 1)];
-        sum = sum + ( evenFlag * co3) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx + 2) >> 1)];
-
-    }
-
-    s_dstPatch[2 + tidy][tidx] = sum;
-
-    if (tidy < 2)
-    {
-        sum = 0;
-
-        if (eveny)
-        {
-            sum = sum + (evenFlag * co3) * s_srcPatch[lsizey-16][1 + ((tidx - 2) >> 1)];
-            sum = sum + ( oddFlag * co2) * s_srcPatch[lsizey-16][1 + ((tidx - 1) >> 1)];
-            sum = sum + (evenFlag * co1) * s_srcPatch[lsizey-16][1 + ((tidx    ) >> 1)];
-            sum = sum + ( oddFlag * co2) * s_srcPatch[lsizey-16][1 + ((tidx + 1) >> 1)];
-            sum = sum + (evenFlag * co3) * s_srcPatch[lsizey-16][1 + ((tidx + 2) >> 1)];
-        }
-
-        s_dstPatch[tidy][tidx] = sum;
-    }
-
-    if (tidy > 13)
-    {
-        sum = 0;
-
-        if (eveny)
-        {
-            sum = sum + (evenFlag * co3) * s_srcPatch[lsizey-7][1 + ((tidx - 2) >> 1)];
-            sum = sum + ( oddFlag * co2) * s_srcPatch[lsizey-7][1 + ((tidx - 1) >> 1)];
-            sum = sum + (evenFlag * co1) * s_srcPatch[lsizey-7][1 + ((tidx    ) >> 1)];
-            sum = sum + ( oddFlag * co2) * s_srcPatch[lsizey-7][1 + ((tidx + 1) >> 1)];
-            sum = sum + (evenFlag * co3) * s_srcPatch[lsizey-7][1 + ((tidx + 2) >> 1)];
-
-        }
-        s_dstPatch[4 + tidy][tidx] = sum;
-    }
-
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    sum = co3 * s_dstPatch[2 + tidy - 2][tidx];
-    sum = sum + co2 * s_dstPatch[2 + tidy - 1][tidx];
-    sum = sum + co1 * s_dstPatch[2 + tidy    ][tidx];
-    sum = sum + co2 * s_dstPatch[2 + tidy + 1][tidx];
-    sum = sum + co3 * s_dstPatch[2 + tidy + 2][tidx];
-
-    if ((x < dstCols) && (y < dstRows))
-    {
-        dst[x + y * dstStep] = convert_uchar4_sat_rte(4.0f * sum);
-    }
-}
-
-///////////////////////////////////////////////////////////////////////
-//////////////////////////  CV_16UC4 //////////////////////////////////
-///////////////////////////////////////////////////////////////////////
-__kernel void pyrUp_C4_D2(__global ushort4* src,__global ushort4* dst,
-                          int srcRows,int dstRows,int srcCols,int dstCols,
-                          int srcOffset,int dstOffset,int srcStep,int dstStep)
-{
-    const int x = get_global_id(0);
-    const int y = get_global_id(1);
-
-    __local float4 s_srcPatch[10][10];
-    __local float4 s_dstPatch[20][16];
-
-    srcOffset >>= 3;
-    dstOffset >>= 3;
-    srcStep >>= 3;
-    dstStep >>= 3;
-
-
-    if( get_local_id(0) < 10 && get_local_id(1) < 10 )
-    {
-        int srcx = (int)(get_group_id(0) * get_local_size(0) / 2 + get_local_id(0)) - 1;
-        int srcy = (int)(get_group_id(1) * get_local_size(1) / 2 + get_local_id(1)) - 1;
-
-        srcx = abs(srcx);
-        srcx = min(srcCols - 1,srcx);
-
-        srcy = abs(srcy);
-        srcy = min(srcRows -1 ,srcy);
-
-        s_srcPatch[get_local_id(1)][get_local_id(0)] = convert_float4(src[srcx + srcy * srcStep]);
-    }
-
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    float4 sum = (float4)(0,0,0,0);
-
-    const float4 evenFlag = (float4)((get_local_id(0) & 1) == 0);
-    const float4 oddFlag = (float4)((get_local_id(0) & 1) != 0);
-    const bool  eveny = ((get_local_id(1) & 1) == 0);
-    const int tidx = get_local_id(0);
-
-    float4 co1 = (float4)(0.375f, 0.375f, 0.375f, 0.375f);
-    float4 co2 = (float4)(0.25f, 0.25f, 0.25f, 0.25f);
-    float4 co3 = (float4)(0.0625f, 0.0625f, 0.0625f, 0.0625f);
-
-
-    if(eveny)
-    {
-        sum = sum + ( evenFlag* co3 ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 2) >> 1)];
-        sum = sum + ( oddFlag * co2 ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 1) >> 1)];
-        sum = sum + ( evenFlag* co1 ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx    ) >> 1)];
-        sum = sum + ( oddFlag * co2 ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 1) >> 1)];
-        sum = sum + ( evenFlag* co3 ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 2) >> 1)];
-
-    }
-
-    s_dstPatch[2 + get_local_id(1)][get_local_id(0)] = sum;
-
-    if (get_local_id(1) < 2)
-    {
-        sum = 0;
-
-        if (eveny)
-        {
-            sum = sum + (evenFlag * co3 ) * s_srcPatch[0][1 + ((tidx - 2) >> 1)];
-            sum = sum + (oddFlag * co2  ) * s_srcPatch[0][1 + ((tidx - 1) >> 1)];
-            sum = sum + (evenFlag * co1 ) * s_srcPatch[0][1 + ((tidx    ) >> 1)];
-            sum = sum + (oddFlag * co2  ) * s_srcPatch[0][1 + ((tidx + 1) >> 1)];
-            sum = sum + (evenFlag * co3 ) * s_srcPatch[0][1 + ((tidx + 2) >> 1)];
-        }
-
-        s_dstPatch[get_local_id(1)][get_local_id(0)] = sum;
-    }
-
-    if (get_local_id(1) > 13)
-    {
-        sum = 0;
-
-        if (eveny)
-        {
-            sum = sum + (evenFlag * co3) * s_srcPatch[9][1 + ((tidx - 2) >> 1)];
-            sum = sum + ( oddFlag * co2) * s_srcPatch[9][1 + ((tidx - 1) >> 1)];
-            sum = sum + (evenFlag * co1) * s_srcPatch[9][1 + ((tidx    ) >> 1)];
-            sum = sum + ( oddFlag * co2) * s_srcPatch[9][1 + ((tidx + 1) >> 1)];
-            sum = sum + (evenFlag * co3) * s_srcPatch[9][1 + ((tidx + 2) >> 1)];
-
-        }
-        s_dstPatch[4 + get_local_id(1)][get_local_id(0)] = sum;
-    }
-
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    sum = 0;
-
-    const int tidy = get_local_id(1);
-
-    sum = sum + co3 * s_dstPatch[2 + tidy - 2][get_local_id(0)];
-    sum = sum + co2 * s_dstPatch[2 + tidy - 1][get_local_id(0)];
-    sum = sum + co1 * s_dstPatch[2 + tidy    ][get_local_id(0)];
-    sum = sum + co2 * s_dstPatch[2 + tidy + 1][get_local_id(0)];
-    sum = sum + co3 * s_dstPatch[2 + tidy + 2][get_local_id(0)];
-
-    if ((x < dstCols) && (y < dstRows))
-    {
-        dst[x + y * dstStep] = convert_ushort4_sat_rte(4.0f * sum);
-    }
-}
-
-///////////////////////////////////////////////////////////////////////
-//////////////////////////  CV_32FC4 //////////////////////////////////
-///////////////////////////////////////////////////////////////////////
-__kernel void pyrUp_C4_D5(__global float4* src,__global float4* dst,
-                          int srcRows,int dstRows,int srcCols,int dstCols,
-                          int srcOffset,int dstOffset,int srcStep,int dstStep)
-{
-    const int x = get_global_id(0);
-    const int y = get_global_id(1);
-    const int tidx = get_local_id(0);
-    const int tidy = get_local_id(1);
-    const int lsizex = get_local_size(0);
-    const int lsizey = get_local_size(1);
-    __local float4 s_srcPatch[10][10];
-    __local float4 s_dstPatch[20][16];
-
-    srcOffset >>= 4;
-    dstOffset >>= 4;
-    srcStep >>= 4;
-    dstStep >>= 4;
-
-
-    if( tidx < 10 && tidy < 10 )
-    {
-        int srcx = (int)(get_group_id(0) * get_local_size(0) / 2 + tidx) - 1;
-        int srcy = (int)(get_group_id(1) * get_local_size(1) / 2 + tidy) - 1;
-
-        srcx = abs(srcx);
-        srcx = min(srcCols - 1,srcx);
-
-        srcy = abs(srcy);
-        srcy = min(srcRows -1 ,srcy);
-
-        s_srcPatch[tidy][tidx] = (float4)(src[srcx + srcy * srcStep]);
-    }
-
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    float4 sum = (float4)(0,0,0,0);
-
-    const float4 evenFlag = (float4)((tidx & 1) == 0);
-    const float4 oddFlag = (float4)((tidx & 1) != 0);
-    const bool  eveny = ((tidy & 1) == 0);
-
-    float4 co1 = (float4)(0.375f, 0.375f, 0.375f, 0.375f);
-    float4 co2 = (float4)(0.25f, 0.25f, 0.25f, 0.25f);
-    float4 co3 = (float4)(0.0625f, 0.0625f, 0.0625f, 0.0625f);
-
-
-    if(eveny)
-    {
-        sum = sum + ( evenFlag* co3 ) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx - 2) >> 1)];
+        sum =       ( evenFlag* co3 ) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx - 2) >> 1)];
         sum = sum + ( oddFlag * co2 ) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx - 1) >> 1)];
         sum = sum + ( evenFlag* co1 ) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx    ) >> 1)];
         sum = sum + ( oddFlag * co2 ) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx + 1) >> 1)];
         sum = sum + ( evenFlag* co3 ) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx + 2) >> 1)];
-
     }
 
     s_dstPatch[2 + tidy][tidx] = sum;
@@ -632,8 +108,8 @@ __kernel void pyrUp_C4_D5(__global float4* src,__global float4* dst,
 
         if (eveny)
         {
-            sum = sum + (evenFlag * co3 ) * s_srcPatch[lsizey-16][1 + ((tidx - 2) >> 1)];
-            sum = sum + (oddFlag * co2  ) * s_srcPatch[lsizey-16][1 + ((tidx - 1) >> 1)];
+            sum =       (evenFlag * co3 ) * s_srcPatch[lsizey-16][1 + ((tidx - 2) >> 1)];
+            sum = sum + ( oddFlag * co2 ) * s_srcPatch[lsizey-16][1 + ((tidx - 1) >> 1)];
             sum = sum + (evenFlag * co1 ) * s_srcPatch[lsizey-16][1 + ((tidx    ) >> 1)];
             sum = sum + ( oddFlag * co2 ) * s_srcPatch[lsizey-16][1 + ((tidx + 1) >> 1)];
             sum = sum + (evenFlag * co3 ) * s_srcPatch[lsizey-16][1 + ((tidx + 2) >> 1)];
@@ -648,26 +124,23 @@ __kernel void pyrUp_C4_D5(__global float4* src,__global float4* dst,
 
         if (eveny)
         {
-            sum = sum + (evenFlag * co3) * s_srcPatch[lsizey-7][1 + ((tidx - 2) >> 1)];
+            sum =       (evenFlag * co3) * s_srcPatch[lsizey-7][1 + ((tidx - 2) >> 1)];
             sum = sum + ( oddFlag * co2) * s_srcPatch[lsizey-7][1 + ((tidx - 1) >> 1)];
             sum = sum + (evenFlag * co1) * s_srcPatch[lsizey-7][1 + ((tidx    ) >> 1)];
             sum = sum + ( oddFlag * co2) * s_srcPatch[lsizey-7][1 + ((tidx + 1) >> 1)];
             sum = sum + (evenFlag * co3) * s_srcPatch[lsizey-7][1 + ((tidx + 2) >> 1)];
-
         }
         s_dstPatch[4 + tidy][tidx] = sum;
     }
 
     barrier(CLK_LOCAL_MEM_FENCE);
 
-    sum = co3 * s_dstPatch[2 + tidy - 2][tidx];
+    sum =       co3 * s_dstPatch[2 + tidy - 2][tidx];
     sum = sum + co2 * s_dstPatch[2 + tidy - 1][tidx];
     sum = sum + co1 * s_dstPatch[2 + tidy    ][tidx];
     sum = sum + co2 * s_dstPatch[2 + tidy + 1][tidx];
     sum = sum + co3 * s_dstPatch[2 + tidy + 2][tidx];
 
     if ((x < dstCols) && (y < dstRows))
-    {
-        dst[x + y * dstStep] = 4.0f * sum;
-    }
+        dst[x + y * dstStep] = convertToType(4.0f * sum);
 }
diff --git a/modules/ocl/src/opencl/svm.cl b/modules/ocl/src/opencl/svm.cl
new file mode 100644
index 000000000..074ceb059
--- /dev/null
+++ b/modules/ocl/src/opencl/svm.cl
@@ -0,0 +1,209 @@
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2013, Institute Of Software Chinese Academy Of Science, all rights reserved.
+// Copyright (C) 2010-2013, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Erping Pang, erping@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//
+#if defined (DOUBLE_SUPPORT)
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64:enable
+#elif defined (cl_amd_fp64)
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#endif
+#define TYPE double
+#else
+#define TYPE float
+#endif
+#if defined ADDEXP // EXP applies exp() only when ADDEXP is defined at build time
+#define EXP(X) exp(X)
+#else
+#define EXP(X) (X) // identity; parenthesized so the expansion stays a single operand
+#endif
+#if defined ADDPOW // POW raises |X| to the power Y only when ADDPOW is defined at build time
+#define POW(X,Y) pow(fabs(X),(Y))
+#else
+#define POW(X,Y) (X) // identity, Y is ignored; parenthesized for the same reason as EXP
+#endif
+#define FLT_MAX   3.402823466e+38F
+#define MAX_VAL   (FLT_MAX*1e-3)
+
+// Linear SVM kernel. Work-item (col, row) takes the dot product of the 'width' floats that
+// start at src + row * src_step and at src2 + col * src2_step, scales it by alpha, adds
+// beta and stores the result, capped at MAX_VAL, in dst[row * dst_step + col].
+__kernel void svm_linear(__global float* src, int src_step, __global float* src2, int src2_step, __global TYPE* dst, int dst_step, int src_rows, int src2_cols,
+                         int width, TYPE alpha, TYPE beta)
+{
+    const int col = get_global_id(0);
+    const int row = get_global_id(1);
+
+    // Work-items outside the src_rows x src2_cols result have nothing to do.
+    if(row >= src_rows || col >= src2_cols)
+        return;
+
+    // The steps are applied as element (float) offsets, not byte offsets.
+    __global const float* lhs = src + row * src_step;
+    __global const float* rhs = src2 + col * src2_step;
+    TYPE acc = 0.0;
+    int k = 0;
+
+    // Vector part: 16 products per pass, summed left to right in float.
+    // The strict '<' bound leaves up to 16 trailing elements for the scalar loop.
+    for(; k < width - 16; k += 16)
+    {
+        const float16 prod = vload16(0, lhs + k) * vload16(0, rhs + k);
+        acc += prod.s0 + prod.s1 + prod.s2 + prod.s3 + prod.s4 + prod.s5 + prod.s6 + prod.s7 +
+               prod.s8 + prod.s9 + prod.sa + prod.sb + prod.sc + prod.sd + prod.se + prod.sf;
+    }
+
+    // Scalar tail.
+    for(; k < width; ++k)
+        acc += lhs[k] * rhs[k];
+
+    const TYPE value = (TYPE) (acc * alpha + beta);
+
+    // Anything above MAX_VAL is replaced by MAX_VAL; other values are stored as they are.
+    dst[row * dst_step + col] = (value > MAX_VAL) ? MAX_VAL : value;
+}
+// Sigmoid SVM kernel. Work-item (col, row) computes tp = alpha * dot + beta for the 'width'
+// floats starting at src + row * src_step and src2 + col * src2_step, then e = exp(-|tp|),
+// and stores (1 - e) / (1 + e) for tp > 0, else (e - 1) / (e + 1), in dst[row * dst_step + col].
+__kernel void svm_sigmod(__global float* src, int src_step, __global float* src2, int src2_step, __global TYPE* dst, int dst_step, int src_rows, int src2_cols,
+                         int width, TYPE alpha, TYPE beta)
+{
+    const int col = get_global_id(0);
+    const int row = get_global_id(1);
+
+    // Work-items outside the src_rows x src2_cols result have nothing to do.
+    if(row >= src_rows || col >= src2_cols)
+        return;
+
+    // The steps are applied as element (float) offsets, not byte offsets.
+    __global const float* lhs = src + row * src_step;
+    __global const float* rhs = src2 + col * src2_step;
+    TYPE acc = 0.0;
+    int k = 0;
+
+    // Vector part: 16 products per pass, summed left to right in float.
+    // The strict '<' bound leaves up to 16 trailing elements for the scalar loop.
+    for(; k < width - 16; k += 16)
+    {
+        const float16 prod = vload16(0, lhs + k) * vload16(0, rhs + k);
+        acc += prod.s0 + prod.s1 + prod.s2 + prod.s3 + prod.s4 + prod.s5 + prod.s6 + prod.s7 +
+               prod.s8 + prod.s9 + prod.sa + prod.sb + prod.sc + prod.sd + prod.se + prod.sf;
+    }
+
+    // Scalar tail.
+    for(; k < width; ++k)
+        acc += lhs[k] * rhs[k];
+
+    const TYPE tp = (TYPE) (acc * alpha + beta);
+    const TYPE e = exp(-fabs(tp));
+
+    // The two branches differ only in the sign of the numerator.
+    TYPE value;
+    if(tp > 0)
+        value = (TYPE)((1. - e) / (1. + e));
+    else
+        value = (TYPE)((e - 1.) / (e + 1.));
+
+    // Anything above MAX_VAL is replaced by MAX_VAL; other values are stored as they are.
+    dst[row * dst_step + col] = (value > MAX_VAL) ? MAX_VAL : value;
+}
+// Polynomial SVM kernel. Work-item (col, row) stores POW(alpha * dot + beta, degree), capped at
+// MAX_VAL, in dst[row * dst_step + col]; dot runs over 'width' floats of src and src2.
+__kernel void svm_poly(__global float* src, int src_step, __global float* src2, int src2_step, __global TYPE* dst, int dst_step, int src_rows, int src2_cols,
+                       int width, TYPE alpha, TYPE beta, TYPE degree)
+{
+    const int col = get_global_id(0);
+    const int row = get_global_id(1);
+
+    // Work-items outside the src_rows x src2_cols result have nothing to do.
+    if(row >= src_rows || col >= src2_cols)
+        return;
+
+    __global const float* lhs = src + row * src_step;
+    __global const float* rhs = src2 + col * src2_step;
+    TYPE acc = 0.0;
+    int k = 0;
+
+    // Vector part: 16 products per pass; up to 16 trailing elements go to the scalar loop.
+    for(; k < width - 16; k += 16)
+    {
+        const float16 prod = vload16(0, lhs + k) * vload16(0, rhs + k);
+        acc += prod.s0 + prod.s1 + prod.s2 + prod.s3 + prod.s4 + prod.s5 + prod.s6 + prod.s7 +
+               prod.s8 + prod.s9 + prod.sa + prod.sb + prod.sc + prod.sd + prod.se + prod.sf;
+    }
+
+    // Scalar tail.
+    for(; k < width; ++k)
+        acc += lhs[k] * rhs[k];
+
+    // POW is a build-time macro: pow(fabs(x), y) with ADDPOW defined, otherwise just x.
+    const TYPE value = (TYPE)(POW((acc * alpha + beta), degree));
+
+    // Anything above MAX_VAL is replaced by MAX_VAL; other values are stored as they are.
+    dst[row * dst_step + col] = (value > MAX_VAL) ? MAX_VAL : value;
+}
+// RBF SVM kernel. Work-item (col, row) stores EXP(gamma * d2), capped at MAX_VAL, in
+// dst[row * dst_step + col]; d2 is the squared distance over 'width' floats of src and src2.
+__kernel void svm_rbf(__global float* src, int src_step, __global float* src2, int src2_step, __global TYPE* dst, int dst_step, int src_rows, int src2_cols,
+                      int width, TYPE gamma)
+{
+    const int col = get_global_id(0);
+    const int row = get_global_id(1);
+
+    if(row >= src_rows || col >= src2_cols)
+        return;
+
+    __global const float* lhs = src + row * src_step;
+    __global const float* rhs = src2 + col * src2_step;
+    TYPE acc = 0.0;
+    int k = 0;
+
+    // Vector part: 16 squared differences per pass; the scalar loop handles the rest.
+    for(; k < width - 16; k += 16)
+    {
+        const float16 diff = vload16(0, lhs + k) - vload16(0, rhs + k);
+        const float16 sq = diff * diff;
+        acc += sq.s0 + sq.s1 + sq.s2 + sq.s3 + sq.s4 + sq.s5 + sq.s6 + sq.s7 +
+               sq.s8 + sq.s9 + sq.sa + sq.sb + sq.sc + sq.sd + sq.se + sq.sf;
+    }
+    for(; k < width; ++k)
+    {
+        const float d = lhs[k] - rhs[k];
+        acc += d * d;
+    }
+
+    // EXP is a build-time macro: exp(x) with ADDEXP defined, otherwise just x.
+    const TYPE value = EXP((TYPE)(acc * gamma));
+    dst[row * dst_step + col] = (value > MAX_VAL) ? MAX_VAL : value;
+}
\ No newline at end of file
diff --git a/modules/ocl/src/optical_flow_farneback.cpp b/modules/ocl/src/optical_flow_farneback.cpp
index e622446bb..a514324f7 100644
--- a/modules/ocl/src/optical_flow_farneback.cpp
+++ b/modules/ocl/src/optical_flow_farneback.cpp
@@ -73,11 +73,6 @@ oclMat gKer;
 
 float ig[4];
 
-inline int divUp(int total, int grain)
-{
-    return (total + grain - 1) / grain;
-}
-
 inline void setGaussianBlurKernel(const float *c_gKer, int ksizeHalf)
 {
     cv::Mat t_gKer(1, ksizeHalf + 1, CV_32FC1, const_cast<float *>(c_gKer));
@@ -86,9 +81,9 @@ inline void setGaussianBlurKernel(const float *c_gKer, int ksizeHalf)
 
 static void gaussianBlurOcl(const oclMat &src, int ksizeHalf, oclMat &dst)
 {
-    string kernelName("gaussianBlur");
+    String kernelName("gaussianBlur");
     size_t localThreads[3] = { 256, 1, 1 };
-    size_t globalThreads[3] = { divUp(src.cols, localThreads[0]) * localThreads[0], src.rows, 1 };
+    size_t globalThreads[3] = { src.cols, src.rows, 1 };
     int smem_size = (localThreads[0] + 2*ksizeHalf) * sizeof(float);
 
     CV_Assert(dst.size() == src.size());
@@ -109,7 +104,7 @@ static void gaussianBlurOcl(const oclMat &src, int ksizeHalf, oclMat &dst)
 
 static void polynomialExpansionOcl(const oclMat &src, int polyN, oclMat &dst)
 {
-    string kernelName("polynomialExpansion");
+    String kernelName("polynomialExpansion");
     size_t localThreads[3] = { 256, 1, 1 };
     size_t globalThreads[3] = { divUp(src.cols, localThreads[0] - 2*polyN) * localThreads[0], src.rows, 1 };
     int smem_size = 3 * localThreads[0] * sizeof(float);
@@ -136,12 +131,9 @@ static void polynomialExpansionOcl(const oclMat &src, int polyN, oclMat &dst)
 
 static void updateMatricesOcl(const oclMat &flowx, const oclMat &flowy, const oclMat &R0, const oclMat &R1, oclMat &M)
 {
-    string kernelName("updateMatrices");
+    String kernelName("updateMatrices");
     size_t localThreads[3] = { 32, 8, 1 };
-    size_t globalThreads[3] = { divUp(flowx.cols, localThreads[0]) * localThreads[0],
-                                divUp(flowx.rows, localThreads[1]) * localThreads[1],
-                                1
-                              };
+    size_t globalThreads[3] = { flowx.cols, flowx.rows, 1 };
 
     std::vector< std::pair<size_t, const void *> > args;
     args.push_back(std::make_pair(sizeof(cl_mem), (void *)&M.data));
@@ -163,10 +155,10 @@ static void updateMatricesOcl(const oclMat &flowx, const oclMat &flowy, const oc
 
 static void boxFilter5Ocl(const oclMat &src, int ksizeHalf, oclMat &dst)
 {
-    string kernelName("boxFilter5");
+    String kernelName("boxFilter5");
     int height = src.rows / 5;
     size_t localThreads[3] = { 256, 1, 1 };
-    size_t globalThreads[3] = { divUp(src.cols, localThreads[0]) * localThreads[0], height, 1 };
+    size_t globalThreads[3] = { src.cols, height, 1 };
     int smem_size = (localThreads[0] + 2*ksizeHalf) * 5 * sizeof(float);
 
     std::vector< std::pair<size_t, const void *> > args;
@@ -185,13 +177,10 @@ static void boxFilter5Ocl(const oclMat &src, int ksizeHalf, oclMat &dst)
 
 static void updateFlowOcl(const oclMat &M, oclMat &flowx, oclMat &flowy)
 {
-    string kernelName("updateFlow");
+    String kernelName("updateFlow");
     int cols = divUp(flowx.cols, 4);
     size_t localThreads[3] = { 32, 8, 1 };
-    size_t globalThreads[3] = { divUp(cols, localThreads[0]) * localThreads[0],
-                                divUp(flowx.rows, localThreads[1]) * localThreads[0],
-                                1
-                              };
+    size_t globalThreads[3] = { cols, flowx.rows, 1 };
 
     std::vector< std::pair<size_t, const void *> > args;
     args.push_back(std::make_pair(sizeof(cl_mem), (void *)&flowx.data));
@@ -209,11 +198,10 @@ static void updateFlowOcl(const oclMat &M, oclMat &flowx, oclMat &flowy)
 
 static void gaussianBlur5Ocl(const oclMat &src, int ksizeHalf, oclMat &dst)
 {
-    string kernelName("gaussianBlur5");
+    String kernelName("gaussianBlur5");
     int height = src.rows / 5;
-    int width = src.cols;
     size_t localThreads[3] = { 256, 1, 1 };
-    size_t globalThreads[3] = { divUp(width, localThreads[0]) * localThreads[0], height, 1 };
+    size_t globalThreads[3] = { src.cols, height, 1 };
     int smem_size = (localThreads[0] + 2*ksizeHalf) * 5 * sizeof(float);
 
     std::vector< std::pair<size_t, const void *> > args;
@@ -222,7 +210,7 @@ static void gaussianBlur5Ocl(const oclMat &src, int ksizeHalf, oclMat &dst)
     args.push_back(std::make_pair(sizeof(cl_mem), (void *)&gKer.data));
     args.push_back(std::make_pair(smem_size, (void *)NULL));
     args.push_back(std::make_pair(sizeof(cl_int), (void *)&height));
-    args.push_back(std::make_pair(sizeof(cl_int), (void *)&width));
+    args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.cols));
     args.push_back(std::make_pair(sizeof(cl_int), (void *)&dst.step));
     args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.step));
     args.push_back(std::make_pair(sizeof(cl_int), (void *)&ksizeHalf));
@@ -324,7 +312,7 @@ void cv::ocl::FarnebackOpticalFlow::prepareGaussian(
 
 void cv::ocl::FarnebackOpticalFlow::setPolynomialExpansionConsts(int n, double sigma)
 {
-    vector<float> buf(n*6 + 3);
+    std::vector<float> buf(n*6 + 3);
     float* g = &buf[0] + n;
     float* xg = g + n*2 + 1;
     float* xxg = xg + n*2 + 1;
diff --git a/modules/ocl/src/pyrdown.cpp b/modules/ocl/src/pyrdown.cpp
index 801a29f25..242dd8fef 100644
--- a/modules/ocl/src/pyrdown.cpp
+++ b/modules/ocl/src/pyrdown.cpp
@@ -69,24 +69,11 @@ static void pyrdown_run(const oclMat &src, const oclMat &dst)
     CV_Assert(src.depth() != CV_8S);
 
     Context  *clCxt = src.clCxt;
-    //int channels = dst.channels();
-    //int depth = dst.depth();
-
     String kernelName = "pyrDown";
 
-    //int vector_lengths[4][7] = {{4, 0, 4, 4, 1, 1, 1},
-    //    {4, 0, 4, 4, 1, 1, 1},
-    //    {4, 0, 4, 4, 1, 1, 1},
-    //    {4, 0, 4, 4, 1, 1, 1}
-    //};
-
-    //size_t vector_length = vector_lengths[channels-1][depth];
-    //int offset_cols = (dst.offset / dst.elemSize1()) & (vector_length - 1);
-
     size_t localThreads[3]  = { 256, 1, 1 };
     size_t globalThreads[3] = { src.cols, dst.rows, 1};
 
-    //int dst_step1 = dst.cols * dst.elemSize();
     std::vector<std::pair<size_t , const void *> > args;
     args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src.data ));
     args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.step ));
@@ -103,7 +90,9 @@ static void pyrdown_run(const oclMat &src, const oclMat &dst)
 
 void cv::ocl::pyrDown(const oclMat &src, oclMat &dst)
 {
-    CV_Assert(src.depth() <= CV_32F && src.channels() <= 4);
+    int depth = src.depth(), channels = src.channels();
+    CV_Assert(depth == CV_8U || depth == CV_16U || depth == CV_16S || depth == CV_32F);
+    CV_Assert(channels == 1 || channels == 3 || channels == 4);
 
     dst.create((src.rows + 1) / 2, (src.cols + 1) / 2, src.type());
 
diff --git a/modules/ocl/src/pyrup.cpp b/modules/ocl/src/pyrup.cpp
index 1f80f4fb3..4fdaac619 100644
--- a/modules/ocl/src/pyrup.cpp
+++ b/modules/ocl/src/pyrup.cpp
@@ -58,13 +58,29 @@ namespace cv
     namespace ocl
     {
         extern const char *pyr_up;
+
         void pyrUp(const cv::ocl::oclMat &src, cv::ocl::oclMat &dst)
         {
+            int depth = src.depth(), channels = src.channels(), oclChannels = src.oclchannels();
+
+            CV_Assert(depth == CV_8U || depth == CV_16U || depth == CV_16S || depth == CV_32F);
+            CV_Assert(channels == 1 || channels == 3 || channels == 4);
+
             dst.create(src.rows * 2, src.cols * 2, src.type());
 
             Context *clCxt = src.clCxt;
 
+            const char * const typeMap[] = { "uchar", "char", "ushort", "short", "int", "float" };
+            char buildOptions[250], convertString[50];
+            const char * const channelsString = oclChannels == 1 ? "" : "4";
+            sprintf(convertString, "convert_%s%s_sat_rte", typeMap[depth], channelsString);
+            sprintf(buildOptions, "-D Type=%s%s -D floatType=float%s -D convertToType=%s -D convertToFloat=%s",
+                    typeMap[depth], channelsString, channelsString,
+                    depth == CV_32F ? "" : convertString,
+                    oclChannels == 4 ? "convert_float4" : "(float)");
+
             const String kernelName = "pyrUp";
+            int dststep = dst.step / dst.elemSize(), srcstep = src.step / src.elemSize();
 
             std::vector< std::pair<size_t, const void *> > args;
             args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src.data));
@@ -75,14 +91,15 @@ namespace cv
             args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.cols));
             args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.offset));
             args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.offset));
-            args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.step));
-            args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.step));
+            args.push_back( std::make_pair( sizeof(cl_int), (void *)&srcstep));
+            args.push_back( std::make_pair( sizeof(cl_int), (void *)&dststep));
 
             size_t globalThreads[3] = {dst.cols, dst.rows, 1};
             size_t localThreads[3]  = {16, 16, 1};
 
 
-            openCLExecuteKernel(clCxt, &pyr_up, kernelName, globalThreads, localThreads, args, src.oclchannels(), src.depth());
+            openCLExecuteKernel(clCxt, &pyr_up, kernelName, globalThreads, localThreads, args, -1, -1,
+                                buildOptions);
         }
     }
 }
diff --git a/modules/ocl/src/split_merge.cpp b/modules/ocl/src/split_merge.cpp
index 571193c2f..21b7b2267 100644
--- a/modules/ocl/src/split_merge.cpp
+++ b/modules/ocl/src/split_merge.cpp
@@ -68,61 +68,6 @@ namespace cv
     {
         namespace split_merge
         {
-            ///////////////////////////////////////////////////////////
-            ///////////////common/////////////////////////////////////
-            /////////////////////////////////////////////////////////
-            inline int divUp(int total, int grain)
-            {
-                return (total + grain - 1) / grain;
-            }
-            ////////////////////////////////////////////////////////////////////////////
-            ////////////////////merge//////////////////////////////////////////////////
-            ////////////////////////////////////////////////////////////////////////////
-            // static void merge_vector_run_no_roi(const oclMat *mat_src, size_t n, oclMat &mat_dst)
-            // {
-            //     Context  *clCxt = mat_dst.clCxt;
-            //     int channels = mat_dst.oclchannels();
-            //     int depth = mat_dst.depth();
-
-            //     String kernelName = "merge_vector";
-
-            //     int indexes[4][7] = {{0, 0, 0, 0, 0, 0, 0},
-            //         {4, 4, 2, 2, 1, 1, 1},
-            //         {4, 4, 2, 2 , 1, 1, 1},
-            //         {4, 4, 2, 2, 1, 1, 1}
-            //     };
-
-            //     size_t index = indexes[channels - 1][mat_dst.depth()];
-            //     int    cols = divUp(mat_dst.cols, index);
-            //     size_t localThreads[3]  = { 64, 4, 1 };
-            //     size_t globalThreads[3] = { divUp(cols, localThreads[0]) *localThreads[0],
-            //                                 divUp(mat_dst.rows, localThreads[1]) *localThreads[1],
-            //                                 1
-            //                               };
-
-            //     std::vector<std::pair<size_t , const void *> > args;
-            //     args.push_back( std::make_pair( sizeof(cl_int), (void *)&mat_dst.rows));
-            //     args.push_back( std::make_pair( sizeof(cl_int), (void *)&cols));
-            //     args.push_back( std::make_pair( sizeof(cl_mem), (void *)&mat_dst.data));
-            //     args.push_back( std::make_pair( sizeof(cl_int), (void *)&mat_dst.step));
-            //     args.push_back( std::make_pair( sizeof(cl_mem), (void *)&mat_src[0].data));
-            //     args.push_back( std::make_pair( sizeof(cl_int), (void *)&mat_src[0].step));
-            //     args.push_back( std::make_pair( sizeof(cl_mem), (void *)&mat_src[1].data));
-            //     args.push_back( std::make_pair( sizeof(cl_int), (void *)&mat_src[1].step));
-            //     if(n >= 3)
-            //     {
-            //         args.push_back( std::make_pair( sizeof(cl_mem), (void *)&mat_src[2].data));
-            //         args.push_back( std::make_pair( sizeof(cl_int), (void *)&mat_src[2].step));
-            //     }
-            //     if(n >= 4)
-            //     {
-            //         args.push_back( std::make_pair( sizeof(cl_mem), (void *)&mat_src[3].data));
-            //         args.push_back( std::make_pair( sizeof(cl_int), (void *)&mat_src[3].step));
-            //     }
-
-            //     openCLExecuteKernel(clCxt, &merge_mat, kernelName, globalThreads, localThreads, args, channels, depth);
-            // }
-
             static void merge_vector_run(const oclMat *mat_src, size_t n, oclMat &mat_dst)
             {
                 if(!mat_dst.clCxt->supportsFeature(Context::CL_DOUBLE) && mat_dst.type() == CV_64F)
@@ -148,10 +93,7 @@ namespace cv
                 int cols = divUp(mat_dst.cols + offset_cols, vector_length);
 
                 size_t localThreads[3]  = { 64, 4, 1 };
-                size_t globalThreads[3] = { divUp(cols, localThreads[0]) *localThreads[0],
-                                            divUp(mat_dst.rows, localThreads[1]) *localThreads[1],
-                                            1
-                                          };
+                size_t globalThreads[3] = { cols, mat_dst.rows, 1 };
 
                 int dst_step1 = mat_dst.cols * mat_dst.elemSize();
                 std::vector<std::pair<size_t , const void *> > args;
@@ -171,10 +113,6 @@ namespace cv
                     args.push_back( std::make_pair( sizeof(cl_int), (void *)&mat_src[2].step));
                     args.push_back( std::make_pair( sizeof(cl_int), (void *)&mat_src[2].offset));
 
-                    // if channel == 3, then the matrix will convert to channel =4
-                    //if(n == 3)
-                    //   args.push_back( std::make_pair( sizeof(cl_int), (void *)&offset_cols));
-
                     if(n == 3)
                     {
                         args.push_back( std::make_pair( sizeof(cl_mem), (void *)&mat_src[2].data));
@@ -224,53 +162,6 @@ namespace cv
                 mat_dst.create(size, CV_MAKETYPE(depth, total_channels));
                 merge_vector_run(mat_src, n, mat_dst);
             }
-            ////////////////////////////////////////////////////////////////////////////////////////////////////
-            //////////////////////////////////////split/////////////////////////////////////////////////////////////
-            //////////////////////////////////////////////////////////////////////////////////////////////////
-            // static void split_vector_run_no_roi(const oclMat &mat_src, oclMat *mat_dst)
-            // {
-            //     Context  *clCxt = mat_src.clCxt;
-            //     int channels = mat_src.oclchannels();
-            //     int depth = mat_src.depth();
-
-            //     String kernelName = "split_vector";
-
-            //     int indexes[4][7] = {{0, 0, 0, 0, 0, 0, 0},
-            //         {8, 8, 8, 8, 4, 4, 2},
-            //         {8, 8, 8, 8 , 4, 4, 4},
-            //         {4, 4, 2, 2, 1, 1, 1}
-            //     };
-
-            //     size_t index = indexes[channels - 1][mat_dst[0].depth()];
-            //     int cols = divUp(mat_src.cols, index);
-            //     size_t localThreads[3]  = { 64, 4, 1 };
-            //     size_t globalThreads[3] = { divUp(cols, localThreads[0]) *localThreads[0],
-            //                                 divUp(mat_src.rows, localThreads[1]) *localThreads[1],
-            //                                 1
-            //                               };
-
-            //     std::vector<std::pair<size_t , const void *> > args;
-            //     args.push_back( std::make_pair( sizeof(cl_mem), (void *)&mat_src.data));
-            //     args.push_back( std::make_pair( sizeof(cl_int), (void *)&mat_src.step));
-            //     args.push_back( std::make_pair( sizeof(cl_int), (void *)&mat_src.rows));
-            //     args.push_back( std::make_pair( sizeof(cl_int), (void *)&cols));
-            //     args.push_back( std::make_pair( sizeof(cl_mem), (void *)&mat_dst[0].data));
-            //     args.push_back( std::make_pair( sizeof(cl_int), (void *)&mat_dst[0].step));
-            //     args.push_back( std::make_pair( sizeof(cl_mem), (void *)&mat_dst[1].data));
-            //     args.push_back( std::make_pair( sizeof(cl_int), (void *)&mat_dst[1].step));
-            //     if(channels >= 3)
-            //     {
-            //         args.push_back( std::make_pair( sizeof(cl_mem), (void *)&mat_dst[2].data));
-            //         args.push_back( std::make_pair( sizeof(cl_int), (void *)&mat_dst[2].step));
-            //     }
-            //     if(channels >= 4)
-            //     {
-            //         args.push_back( std::make_pair( sizeof(cl_mem), (void *)&mat_dst[3].data));
-            //         args.push_back( std::make_pair( sizeof(cl_int), (void *)&mat_dst[3].step));
-            //     }
-
-            //     openCLExecuteKernel(clCxt, &split_mat, kernelName, globalThreads, localThreads, args, channels, depth);
-            // }
             static void split_vector_run(const oclMat &mat_src, oclMat *mat_dst)
             {
 
@@ -306,9 +197,7 @@ namespace cv
                             : divUp(mat_src.cols + max_offset_cols, vector_length);
 
                 size_t localThreads[3]  = { 64, 4, 1 };
-                size_t globalThreads[3] = { divUp(cols, localThreads[0]) *localThreads[0],
-                                            divUp(mat_src.rows, localThreads[1]) *localThreads[1], 1
-                                          };
+                size_t globalThreads[3] = { cols, mat_src.rows, 1 };
 
                 int dst_step1 = mat_dst[0].cols * mat_dst[0].elemSize();
                 std::vector<std::pair<size_t , const void *> > args;
diff --git a/modules/ocl/src/stereo_csbp.cpp b/modules/ocl/src/stereo_csbp.cpp
index f124488b9..66ff0de31 100644
--- a/modules/ocl/src/stereo_csbp.cpp
+++ b/modules/ocl/src/stereo_csbp.cpp
@@ -96,9 +96,6 @@ namespace cv
     {
         namespace stereoCSBP
         {
-            //////////////////////////////////////////////////////////////////////////
-            //////////////////////////////common////////////////////////////////////
-            ////////////////////////////////////////////////////////////////////////
             static inline int divUp(int total, int grain)
             {
                 return (total + grain - 1) / grain;
@@ -170,7 +167,7 @@ namespace cv
                 const int threadsNum = 256;
                 //size_t blockSize = threadsNum;
                 size_t localThreads[3]  = {win_size, 1, threadsNum / win_size};
-                size_t globalThreads[3] = {w *localThreads[0],
+                size_t globalThreads[3] = { w *localThreads[0],
                     h * divUp(rthis.ndisp, localThreads[2]) *localThreads[1], 1 * localThreads[2]
                 };
 
@@ -310,7 +307,6 @@ namespace cv
 
                 cl_kernel kernel = openCLGetKernelFromSource(clCxt, &stereocsbp, kernelName);
 
-                //size_t blockSize = 256;
                 size_t localThreads[]  = {32, 8, 1};
                 size_t globalThreads[] = {divUp(w, localThreads[0]) *localThreads[0],
                     divUp(h, localThreads[1]) *localThreads[1],
@@ -359,8 +355,8 @@ namespace cv
 
                 const size_t threadsNum = 256;
                 //size_t blockSize = threadsNum;
-                size_t localThreads[3]  = {win_size, 1, threadsNum / win_size};
-                size_t globalThreads[3] = {w *localThreads[0],
+                size_t localThreads[3]  = { win_size, 1, threadsNum / win_size };
+                size_t globalThreads[3] = { w *localThreads[0],
                     h * divUp(nr_plane, localThreads[2]) *localThreads[1], 1 * localThreads[2]
                 };
 
diff --git a/modules/ocl/src/stereobm.cpp b/modules/ocl/src/stereobm.cpp
index a5cbe2b9f..4ba756619 100644
--- a/modules/ocl/src/stereobm.cpp
+++ b/modules/ocl/src/stereobm.cpp
@@ -94,10 +94,7 @@ static void prefilter_xsobel(const oclMat &input, oclMat &output, int prefilterC
 #define N_DISPARITIES 8
 #define ROWSperTHREAD 21
 #define BLOCK_W 128
-static inline int divUp(int total, int grain)
-{
-    return (total + grain - 1) / grain;
-}
+
 ////////////////////////////////////////////////////////////////////////////
 ///////////////////////////////stereoBM_GPU////////////////////////////////
 ////////////////////////////////////////////////////////////////////////////
@@ -115,11 +112,10 @@ static void stereo_bm(const oclMat &left, const oclMat &right,  oclMat &disp,
     size_t local_mem_size = (N_DISPARITIES * (BLOCK_W + 2 * winsz2)) *
                             sizeof(cl_uint);
     //size_t blockSize = 1;
-    size_t localThreads[]  = { BLOCK_W, 1,1};
-    size_t globalThreads[] = { divUp(left.cols - maxdisp - 2 * winsz2, BLOCK_W) *BLOCK_W,
+    size_t localThreads[]  = { BLOCK_W, 1, 1 };
+    size_t globalThreads[] = { left.cols - maxdisp - 2 * winsz2,
                                divUp(left.rows - 2 * winsz2, ROWSperTHREAD),
-                               1
-                             };
+                               1 };
 
     std::vector< std::pair<size_t, const void *> > args;
     args.push_back(std::make_pair(sizeof(cl_mem), (void *)&left.data));
@@ -149,10 +145,9 @@ static void postfilter_textureness(oclMat &left, int winSize,
 
     size_t blockSize = 1;
     size_t localThreads[]  = { BLOCK_W, blockSize ,1};
-    size_t globalThreads[] = { divUp(left.cols, BLOCK_W) *BLOCK_W,
+    size_t globalThreads[] = { left.cols,
                                divUp(left.rows, 2 * ROWSperTHREAD),
-                               1
-                             };
+                               1 };
 
     size_t local_mem_size = (localThreads[0] + localThreads[0] + (winSize / 2) * 2) * sizeof(float);
 
diff --git a/modules/ocl/src/stereobp.cpp b/modules/ocl/src/stereobp.cpp
index fa534893c..9a0fe18df 100644
--- a/modules/ocl/src/stereobp.cpp
+++ b/modules/ocl/src/stereobp.cpp
@@ -103,10 +103,7 @@ namespace cv
             {
                 openCLFree(cl_con_struct);
             }
-            static inline int divUp(int total, int grain)
-            {
-                return (total + grain - 1) / grain;
-            }
+
             /////////////////////////////////////////////////////////////////////////////
             ///////////////////////////comp data////////////////////////////////////////
             /////////////////////////////////////////////////////////////////////////
diff --git a/modules/ocl/src/svm.cpp b/modules/ocl/src/svm.cpp
new file mode 100644
index 000000000..70aaa7ab8
--- /dev/null
+++ b/modules/ocl/src/svm.cpp
@@ -0,0 +1,1201 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2013, Institute Of Software Chinese Academy Of Science, all rights reserved.
+// Copyright (C) 2010-2013, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Erping Pang, erping@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+#include "precomp.hpp"
+using namespace cv;
+using namespace ocl;
+
+#if 1   // active configuration: Qfloat (element type of the kernel-row / result buffers below) is float
+typedef float Qfloat;
+#define QFLOAT_TYPE CV_32F
+#else   // disabled alternative: double-precision Qfloat
+typedef double Qfloat;
+#define QFLOAT_TYPE CV_64F
+#endif
+
+namespace cv
+{
+namespace ocl
+{
+///////////////////////////OpenCL kernel strings///////////////////////////
+extern const char *svm;   // defined elsewhere; its address is handed to openCLExecuteKernel by the matmul_* helpers in this file
+}
+}
+class CvSVMKernel_ocl: public CvSVMKernel   // CvSVMKernel variant whose calc routines take a row index and a Mat
+{
+public:
+    typedef void (CvSVMKernel_ocl::*Calc_ocl)( int vec_count, const int row_idx, Qfloat* results, Mat& src);   // pointer to a calc_* member with this signature
+    CvSVMKernel_ocl(const CvSVMParams* params, Calc_ocl _calc_func , Calc _calc_func1);
+
+    Calc_ocl calc_func_ocl;   // presumably the calc_* member selected by create(); the assignment is outside this view
+    bool create( const CvSVMParams* params, Calc_ocl _calc_func, Calc _calc_func1);
+
+    void calc( int vcount, const int row_idx, Qfloat* results, Mat& src);   // entry point used by CvSVM_OCL::predict and CvSVMSolver_ocl::get_row_base
+    void calc_linear( int vec_count, const int row_idx, Qfloat* results, Mat& src);
+
+    void calc_poly( int vec_count, const int row_idx, Qfloat* results, Mat& src);
+    void calc_sigmoid( int vec_count, const int row_idx, Qfloat* results, Mat& src);
+    void calc_non_rbf_base( int vec_count, const int row_idx, Qfloat* results, Mat& src);
+    void calc_rbf( int vec_count, const int row_idx, Qfloat* results, Mat& src);
+};
+class CvSVMSolver_ocl: public CvSVMSolver   // solver variant whose row accessors take an extra Mat argument
+{
+public:
+    CvSVMSolver_ocl();
+    CvSVMSolver_ocl(const CvSVMParams *);                    // stores the parameter pointer in params
+    float* get_row_base( int i, bool* _existed, Mat& src);   // cached kernel row i; *_existed reports whether it was already cached
+    bool solve_generic( CvSVMSolutionInfo& si );
+    float* get_row( int i, float* dst, Mat& src);            // get_row_base() followed by the get_row_func callback
+};
+
+typedef struct CvSparseVecElem32f   // one (index, value) entry of a sparse float vector
+{
+    int idx;     // component index; cvPreparePredictData writes -1 into the terminating entry
+    float val;   // component value
+} CvSparseVecElem32f;
+static int icvCmpSparseVecElems( const void* a, const void* b )
+{   // qsort() comparator: orders entries by ascending idx
+    return static_cast<const CvSparseVecElem32f*>(a)->idx - static_cast<const CvSparseVecElem32f*>(b)->idx;
+}
+void cvPreparePredictData( const CvArr* sample, int dims_all, const CvMat* comp_idx,
+                           int class_count, const CvMat* prob, float** row_sample,
+                           int as_sparse CV_DEFAULT(0) );   // prototype; supplies the default as_sparse = 0
+void  cvPreparePredictData( const CvArr* _sample, int dims_all,
+                            const CvMat* comp_idx, int class_count,
+                            const CvMat* prob, float** _row_sample,
+                            int as_sparse )
+{
+    float* row_sample = 0;          // buffer allocated by this call; stays 0 when the sample's own data is returned
+    int* inverse_comp_idx = 0;      // sparse input only: component index -> position in comp_idx (-1 if not selected)
+    // Validates one sample vector and exposes it through *_row_sample: dims_selected floats, or (as_sparse) CvSparseVecElem32f entries ended by idx == -1.
+    CV_FUNCNAME( "cvPreparePredictData" );
+
+    __CV_BEGIN__;
+
+    const CvMat* sample = (const CvMat*)_sample;
+    float* sample_data;
+    int sample_step;
+    int is_sparse = CV_IS_SPARSE_MAT(sample);
+    int d, sizes[CV_MAX_DIM];
+    int i, dims_selected;
+    int vec_size;
+
+    if( !is_sparse && !CV_IS_MAT(sample) )
+    {
+        CV_ERROR( !sample ? CV_StsNullPtr : CV_StsBadArg, "The sample is not a valid vector" );
+    }
+
+    if( cvGetElemType( sample ) != CV_32FC1 )
+    {
+        CV_ERROR( CV_StsUnsupportedFormat, "Input sample must have 32fC1 type" );
+    }
+
+    CV_CALL( d = cvGetDims( sample, sizes ));
+
+    if( !((is_sparse && d == 1) || (!is_sparse && d == 2 && (sample->rows == 1 || sample->cols == 1))) )
+    {
+        CV_ERROR( CV_StsBadSize, "Input sample must be 1-dimensional vector" );
+    }
+
+    if( d == 1 )
+    {
+        sizes[1] = 1;   // lets the length check below treat the 1-D and 2-D cases uniformly
+    }
+
+    if( sizes[0] + sizes[1] - 1 != dims_all )
+        CV_ERROR( CV_StsUnmatchedSizes,
+                  "The sample size is different from what has been used for training" );
+
+    if( !_row_sample )
+    {
+        CV_ERROR( CV_StsNullPtr, "INTERNAL ERROR: The row_sample pointer is NULL" );
+    }
+
+    if( comp_idx && (!CV_IS_MAT(comp_idx) || comp_idx->rows != 1 ||
+                     CV_MAT_TYPE(comp_idx->type) != CV_32SC1) )
+    {
+        CV_ERROR( CV_StsBadArg, "INTERNAL ERROR: invalid comp_idx" );
+    }
+
+    dims_selected = comp_idx ? comp_idx->cols : dims_all;   // without comp_idx every component is used
+
+    if( prob )   // prob is only validated here; nothing is written to it
+    {
+        if( !CV_IS_MAT(prob) )
+        {
+            CV_ERROR( CV_StsBadArg, "The output matrix of probabilities is invalid" );
+        }
+
+        if( (prob->rows != 1 && prob->cols != 1) ||
+                (CV_MAT_TYPE(prob->type) != CV_32FC1 &&
+                 CV_MAT_TYPE(prob->type) != CV_64FC1) )
+            CV_ERROR( CV_StsBadSize,
+                      "The matrix of probabilities must be 1-dimensional vector of 32fC1 type" );
+
+        if( prob->rows + prob->cols - 1 != class_count )
+            CV_ERROR( CV_StsUnmatchedSizes,
+                      "The vector of probabilities must contain as many elements as "
+                      "the number of classes in the training set" );
+    }
+
+    vec_size = !as_sparse ? dims_selected * sizeof(row_sample[0]) :
+               (dims_selected + 1) * sizeof(CvSparseVecElem32f);   // +1 entry for the idx == -1 terminator
+
+    if( CV_IS_MAT(sample) )
+    {
+        sample_data = sample->data.fl;
+        sample_step = CV_IS_MAT_CONT(sample->type) ? 1 : sample->step / sizeof(row_sample[0]);
+
+        if( !comp_idx && CV_IS_MAT_CONT(sample->type) && !as_sparse )
+        {
+            *_row_sample = sample_data;   // no copy: the caller receives the sample's own data pointer
+        }
+        else
+        {
+            CV_CALL( row_sample = (float*)cvAlloc( vec_size ));
+
+            if( !comp_idx )
+                for( i = 0; i < dims_selected; i++ )
+                {
+                    row_sample[i] = sample_data[sample_step * i];
+                }
+            else
+            {
+                int* comp = comp_idx->data.i;
+                for( i = 0; i < dims_selected; i++ )
+                {
+                    row_sample[i] = sample_data[sample_step * comp[i]];
+                }
+            }
+
+            *_row_sample = row_sample;
+        }
+
+        if( as_sparse )
+        {
+            const float* src = (const float*)row_sample;   // src and dst alias the same buffer
+            CvSparseVecElem32f* dst = (CvSparseVecElem32f*)row_sample;
+
+            dst[dims_selected].idx = -1;
+            for( i = dims_selected - 1; i >= 0; i-- )   // back to front, so floats not yet read are not overwritten by the wider entries
+            {
+                dst[i].idx = i;
+                dst[i].val = src[i];
+            }
+        }
+    }
+    else
+    {
+        CvSparseNode* node;
+        CvSparseMatIterator mat_iterator;
+        const CvSparseMat* sparse = (const CvSparseMat*)sample;
+        assert( is_sparse );
+
+        node = cvInitSparseMatIterator( sparse, &mat_iterator );
+        CV_CALL( row_sample = (float*)cvAlloc( vec_size ));
+
+        if( comp_idx )
+        {
+            CV_CALL( inverse_comp_idx = (int*)cvAlloc( dims_all * sizeof(int) ));
+            memset( inverse_comp_idx, -1, dims_all * sizeof(int) );   // all bytes 0xFF, i.e. every int reads as -1
+            for( i = 0; i < dims_selected; i++ )
+            {
+                inverse_comp_idx[comp_idx->data.i[i]] = i;
+            }
+        }
+
+        if( !as_sparse )
+        {
+            memset( row_sample, 0, vec_size );   // components absent from the sparse input stay 0
+
+            for( ; node != 0; node = cvGetNextSparseNode(&mat_iterator) )
+            {
+                int idx = *CV_NODE_IDX( sparse, node );
+                if( inverse_comp_idx )
+                {
+                    idx = inverse_comp_idx[idx];
+                    if( idx < 0 )
+                    {
+                        continue;   // component not selected by comp_idx
+                    }
+                }
+                row_sample[idx] = *(float*)CV_NODE_VAL( sparse, node );
+            }
+        }
+        else
+        {
+            CvSparseVecElem32f* ptr = (CvSparseVecElem32f*)row_sample;
+
+            for( ; node != 0; node = cvGetNextSparseNode(&mat_iterator) )
+            {
+                int idx = *CV_NODE_IDX( sparse, node );
+                if( inverse_comp_idx )
+                {
+                    idx = inverse_comp_idx[idx];
+                    if( idx < 0 )
+                    {
+                        continue;   // component not selected by comp_idx
+                    }
+                }
+                ptr->idx = idx;
+                ptr->val = *(float*)CV_NODE_VAL( sparse, node );
+                ptr++;
+            }
+
+            qsort( row_sample, ptr - (CvSparseVecElem32f*)row_sample,
+                   sizeof(ptr[0]), icvCmpSparseVecElems );   // sort the collected entries by ascending idx
+            ptr->idx = -1;   // terminator after the last collected entry
+        }
+
+        *_row_sample = row_sample;
+    }
+
+    __CV_END__;
+
+    if( inverse_comp_idx )
+    {
+        cvFree( &inverse_comp_idx );
+    }
+
+    if( cvGetErrStatus() < 0 && _row_sample )   // error path: release the buffer and report no result
+    {
+        cvFree( &row_sample );
+        *_row_sample = 0;
+    }
+}
+float CvSVM_OCL::predict( const int row_index, int row_len, Mat& src, bool returnDFVal ) const
+{
+    assert( kernel );
+    // Computes the SVM response for one row: row_index and src are forwarded to CvSVMKernel_ocl::calc, whose output feeds the decision functions.
+    (void)row_len;   // row_len is not used by this implementation
+
+    int class_count = class_labels ? class_labels->cols :
+                      params.svm_type == ONE_CLASS ? 1 : 0;
+
+    float result = 0;
+    cv::AutoBuffer<float> _buffer(sv_total + (class_count + 1) * 2);   // kernel values first; the vote counters live after the first sv_total floats
+    float* buffer = _buffer;
+
+    if( params.svm_type == EPS_SVR ||
+            params.svm_type == NU_SVR ||
+            params.svm_type == ONE_CLASS )
+    {
+        CvSVMDecisionFunc* df = (CvSVMDecisionFunc*)decision_func;   // a single decision function is used for these types
+        int i, sv_count = df->sv_count;
+        double sum = -df->rho;
+
+        ((CvSVMKernel_ocl*)kernel)->calc( sv_count, row_index, buffer, src);
+        for( i = 0; i < sv_count; i++ )
+        {
+            sum += buffer[i] * df->alpha[i];
+        }
+
+        result = params.svm_type == ONE_CLASS ? (float)(sum > 0) : (float)sum;   // ONE_CLASS yields 0 or 1, regression yields the sum itself
+    }
+    else if( params.svm_type == C_SVC ||
+             params.svm_type == NU_SVC )
+    {
+        CvSVMDecisionFunc* df = (CvSVMDecisionFunc*)decision_func;
+        int* vote = (int*)(buffer + sv_total);   // per-class vote counters stored in the tail of the float buffer
+        int i, j, k;
+
+        memset( vote, 0, class_count * sizeof(vote[0]));
+        ((CvSVMKernel_ocl*)kernel)->calc( sv_total, row_index, buffer, src);
+        double sum = 0.;
+
+        for( i = 0; i < class_count; i++ )
+        {
+            for( j = i + 1; j < class_count; j++, df++ )   // one decision function per class pair (i, j), consumed in order
+            {
+                sum = -df->rho;
+                int sv_count = df->sv_count;
+                for( k = 0; k < sv_count; k++ )
+                {
+                    sum += df->alpha[k] * buffer[df->sv_index[k]];
+                }
+
+                vote[sum > 0 ? i : j]++;
+            }
+        }
+
+        for( i = 1, k = 0; i < class_count; i++ )   // k = first class with the highest vote count
+        {
+            if( vote[i] > vote[k] )
+            {
+                k = i;
+            }
+        }
+        result = returnDFVal && class_count == 2 ? (float)sum : (float)(class_labels->data.i[k]);   // raw decision value only for two-class models on request
+    }
+    else
+        CV_Error( CV_StsBadArg, "INTERNAL ERROR: Unknown SVM type, "
+                  "the SVM structure is probably corrupted" );
+
+    return result;
+}
+float CvSVM_OCL::predict( const Mat& _sample, bool returnDFVal ) const
+{   // Single-sample path: wrap the Mat in a CvMat header and delegate to CvSVM::predict.
+    CvMat sampleHeader = _sample;
+    return CvSVM::predict(&sampleHeader, returnDFVal);
+}
+float CvSVM_OCL::predict( const int row_index, Mat& src, bool returnDFVal) const
+{
+    // Convenience overload: supplies get_var_count() as the row length
+    // expected by the four-argument predict().
+    const int row_len = get_var_count();
+
+    return predict( row_index, row_len, src, returnDFVal);
+}
+#undef get_C                 // helper macros for the solver code below; each is #undef'd first in case it is already defined
+#define get_C(i) (C[y[i]>0]) // C[1] when y[i] > 0, C[0] otherwise
+#undef is_upper_bound
+#define is_upper_bound(i) (alpha_status[i] > 0)
+#undef is_lower_bound
+#define is_lower_bound(i) (alpha_status[i] < 0)
+#undef update_alpha_status
+#define update_alpha_status(i) \
+    alpha_status[i] = (schar)(alpha[i] >= get_C(i) ? 1 : alpha[i] <= 0 ? -1 : 0) // 1: at or above C, -1: at or below 0, 0: strictly between
+
+CvSVMSolver_ocl::CvSVMSolver_ocl(const CvSVMParams* _params)
+{   // Only stores the pointer; solve_generic() later reads params->kernel_type, gamma, coef0 and degree through it.
+    params = _params;
+}
+float* CvSVMSolver_ocl::get_row( int i, float* dst, Mat& src )
+{   // Fetch (or compute) the cached base row, then hand it to get_row_func together with dst and the cache-hit flag.
+    bool wasCached = false;
+    float* const baseRow = get_row_base( i, &wasCached, src);
+    return (this->*get_row_func)( i, baseRow, dst, wasCached );
+}
+float* CvSVMSolver_ocl::get_row_base( int i, bool* _existed, Mat& src )
+{   // Returns the cached kernel row for index i, computing it via CvSVMKernel_ocl::calc on a miss; *_existed (if given) reports a cache hit.
+    int i1 = i < sample_count ? i : i - sample_count;   // indices >= sample_count map back onto the same rows
+    CvSVMKernelRow* row = rows + i1;
+    bool existed = row->data != 0;
+    Qfloat* data;
+
+    if( existed || cache_size <= 0 )
+    {
+        CvSVMKernelRow* del_row = existed ? row : lru_list.prev;   // hit: reuse own buffer; cache exhausted: take the buffer of the row at lru_list.prev
+        data = del_row->data;
+        assert( data != 0 );
+
+        // delete row from the LRU list
+        del_row->data = 0;
+        del_row->prev->next = del_row->next;
+        del_row->next->prev = del_row->prev;
+    }
+    else
+    {
+        data = (Qfloat*)cvMemStorageAlloc( storage, cache_line_size );   // budget left: allocate a fresh row buffer
+        cache_size -= cache_line_size;
+    }
+
+    // insert row into the LRU list
+    row->data = data;
+    row->prev = &lru_list;
+    row->next = lru_list.next;
+    row->prev->next = row->next->prev = row;   // row is now directly after the list head
+
+    if( !existed )
+    {
+        ((CvSVMKernel_ocl*)kernel)->calc( sample_count, i1, row->data, src);   // fill the (new or recycled) buffer
+    }
+
+    if( _existed )
+    {
+        *_existed = existed;
+    }
+
+    return row->data;
+}
+
+#ifndef HAVE_CLAMDBLAS
+static void matmul_sigmod(oclMat & src, oclMat & src2, oclMat & dst, int src_rows, int src2_cols, int var_count, double alpha1, double beta1)
+{   // Launches the "svm_sigmod" kernel of the svm program over a src2_cols x src_rows grid; dst is presumably the output buffer.
+    Context *clCxt = Context::getContext();
+    String kernelName = "svm_sigmod";
+    int src_step = (int)src.step / src.elemSize();       // steps are passed in elements, not bytes
+    int src2_step = (int)src2.step / src2.elemSize();
+    int dst_step = (int)dst.step / dst.elemSize();
+    int x = MIN(16, src_rows);
+    int y = MIN(16, src2_cols);
+    size_t localThreads[] = {x, y, 1};                   // NOTE(review): x derives from src_rows but pairs with the src2_cols global dimension (y likewise) - confirm intended
+    size_t globalThreads[] = {src2_cols, src_rows, 1};
+    int width = var_count;
+
+    std::vector< std::pair<size_t, const void *> > args;   // (size, pointer) pairs; presumably in the kernel's parameter order
+    args.push_back(std::make_pair(sizeof(cl_mem), (void* )&src.data));
+    args.push_back(std::make_pair(sizeof(cl_int), (void* )&src_step));
+    args.push_back(std::make_pair(sizeof(cl_mem), (void* )&src2.data));
+    args.push_back(std::make_pair(sizeof(cl_int), (void* )&src2_step));
+    args.push_back(std::make_pair(sizeof(cl_mem), (void* )&dst.data));
+    args.push_back(std::make_pair(sizeof(cl_int), (void* )&dst_step));
+    args.push_back(std::make_pair(sizeof(cl_int), (void* )&src_rows));
+    args.push_back(std::make_pair(sizeof(cl_int), (void* )&src2_cols));
+    args.push_back(std::make_pair(sizeof(cl_int), (void* )&width));
+
+    float alpha = 0.0f, beta = 0.0f;
+    if(!Context::getContext()->supportsFeature(Context::CL_DOUBLE))
+    {   // no double support on the device: pass the scalars narrowed to float
+        alpha = (float)alpha1;
+        beta = (float)beta1;
+        args.push_back(std::make_pair(sizeof(cl_float), (void* )&alpha));
+        args.push_back(std::make_pair(sizeof(cl_float), (void* )&beta));
+    }
+    else
+    {   // double support available: pass the scalars unchanged
+        args.push_back(std::make_pair(sizeof(cl_double), (void* )&alpha1));
+        args.push_back(std::make_pair(sizeof(cl_double), (void* )&beta1));
+    }
+    openCLExecuteKernel(clCxt, &svm, kernelName, globalThreads, localThreads, args, -1, -1);
+}
+static void matmul_poly(oclMat & src, oclMat & src2, oclMat & dst, int src_rows, int src2_cols, int var_count, double alpha1, double beta1, double degree1, bool flag)
+{   // Launches the "svm_poly" kernel over a src2_cols x src_rows grid; flag == true builds it with -D ADDPOW.
+    Context *clCxt = Context::getContext();
+    String kernelName = "svm_poly";
+    int src_step = (int)src.step / src.elemSize();       // steps are passed in elements, not bytes
+    int src2_step = (int)src2.step / src2.elemSize();
+    int dst_step = (int)dst.step / dst.elemSize();
+    int x = MIN(16, src_rows);
+    int y = MIN(16, src2_cols);
+    size_t localThreads[] = {x, y, 1};
+    size_t globalThreads[] = {src2_cols, src_rows, 1};
+    int width = var_count;
+
+    char build_options[50] = "";   // must be a valid (empty) string when flag is false: it is always passed to openCLExecuteKernel
+
+    if(flag)
+    {
+        sprintf(build_options, "-D ADDPOW");
+    }
+    std::vector< std::pair<size_t, const void *> > args;
+    args.push_back(std::make_pair(sizeof(cl_mem), (void* )&src.data));
+    args.push_back(std::make_pair(sizeof(cl_int), (void* )&src_step));
+    args.push_back(std::make_pair(sizeof(cl_mem), (void* )&src2.data));
+    args.push_back(std::make_pair(sizeof(cl_int), (void* )&src2_step));
+    args.push_back(std::make_pair(sizeof(cl_mem), (void* )&dst.data));
+    args.push_back(std::make_pair(sizeof(cl_int), (void* )&dst_step));
+    args.push_back(std::make_pair(sizeof(cl_int), (void* )&src_rows));
+    args.push_back(std::make_pair(sizeof(cl_int), (void* )&src2_cols));
+    args.push_back(std::make_pair(sizeof(cl_int), (void* )&width));
+
+    float alpha = 0.0f, beta = 0.0f, degree = 0.0f;
+    if(!Context::getContext()->supportsFeature(Context::CL_DOUBLE))
+    {   // no double support on the device: pass the scalars narrowed to float
+        alpha = (float)alpha1;
+        beta = (float)beta1;
+        degree = (float)degree1;
+        args.push_back(std::make_pair(sizeof(cl_float), (void* )&alpha));
+        args.push_back(std::make_pair(sizeof(cl_float), (void* )&beta));
+        args.push_back(std::make_pair(sizeof(cl_float), (void* )&degree));
+    }
+    else
+    {   // double support available: pass the scalars unchanged
+        args.push_back(std::make_pair(sizeof(cl_double), (void* )&alpha1));
+        args.push_back(std::make_pair(sizeof(cl_double), (void* )&beta1));
+        args.push_back(std::make_pair(sizeof(cl_double), (void* )&degree1));
+    }
+    openCLExecuteKernel(clCxt, &svm, kernelName, globalThreads, localThreads, args, -1, -1, build_options);
+}
+static void matmul_linear(oclMat & src, oclMat & src2, oclMat & dst, int src_rows, int src2_cols, int var_count, double alpha1, double beta1)
+{   // Launches the "svm_linear" kernel of the svm program over a src2_cols x src_rows grid; dst is presumably the output buffer.
+    Context *clCxt = Context::getContext();
+    String kernelName = "svm_linear";
+    int src_step = (int)src.step / src.elemSize();       // steps are passed in elements, not bytes
+    int src2_step = (int)src2.step / src2.elemSize();
+    int dst_step = (int)dst.step / dst.elemSize();
+    int x = MIN(16, src_rows);
+    int y = MIN(16, src2_cols);
+    size_t localThreads[] = {x, y, 1};                   // NOTE(review): x derives from src_rows but pairs with the src2_cols global dimension (y likewise) - confirm intended
+    size_t globalThreads[] = {src2_cols, src_rows, 1};
+    int width = var_count;
+
+    std::vector< std::pair<size_t, const void *> > args;   // (size, pointer) pairs; presumably in the kernel's parameter order
+    args.push_back(std::make_pair(sizeof(cl_mem), (void* )&src.data));
+    args.push_back(std::make_pair(sizeof(cl_int), (void* )&src_step));
+    args.push_back(std::make_pair(sizeof(cl_mem), (void* )&src2.data));
+    args.push_back(std::make_pair(sizeof(cl_int), (void* )&src2_step));
+    args.push_back(std::make_pair(sizeof(cl_mem), (void* )&dst.data));
+    args.push_back(std::make_pair(sizeof(cl_int), (void* )&dst_step));
+    args.push_back(std::make_pair(sizeof(cl_int), (void* )&src_rows));
+    args.push_back(std::make_pair(sizeof(cl_int), (void* )&src2_cols));
+    args.push_back(std::make_pair(sizeof(cl_int), (void* )&width));
+
+    float alpha = 0.0f, beta = 0.0f;
+    if(!Context::getContext()->supportsFeature(Context::CL_DOUBLE))
+    {   // no double support on the device: pass the scalars narrowed to float
+        alpha = (float)alpha1;
+        beta = (float)beta1;
+        args.push_back(std::make_pair(sizeof(cl_float), (void* )&alpha));
+        args.push_back(std::make_pair(sizeof(cl_float), (void* )&beta));
+    }
+    else
+    {   // double support available: pass the scalars unchanged
+        args.push_back(std::make_pair(sizeof(cl_double), (void* )&alpha1));
+        args.push_back(std::make_pair(sizeof(cl_double), (void* )&beta1));
+    }
+    openCLExecuteKernel(clCxt, &svm, kernelName, globalThreads, localThreads, args, -1, -1);
+}
+#endif // #ifndef HAVE_CLAMDBLAS
+
+static void matmul_rbf(oclMat& src, oclMat& src_e, oclMat& dst, int src_rows, int src2_cols, int var_count, double gamma1, bool flag)
+{
+    // Launches the "svm_rbf" kernel over a src2_cols x src_rows grid; flag == true builds it with -D ADDEXP.
+    Context *clCxt = Context::getContext();
+
+    String kernelName = "svm_rbf";
+
+    int width = var_count;
+    int src_step = (int)src.step / src.elemSize();       // steps are passed in elements, not bytes
+    int src_e_step = (int)src_e.step / src_e.elemSize();
+    int dst_step = (int)dst.step / dst.elemSize();
+
+    int x = MIN(16, src_rows);
+    int y = MIN(16, src2_cols);
+    size_t localThreads[] = {x, y, 1};
+    size_t globalThreads[] = {src2_cols,  src_rows, 1};
+    char build_options[50] = "";   // must be a valid (empty) string when flag is false: it is always passed to openCLExecuteKernel
+
+    if(flag)
+    {
+        sprintf(build_options, "-D ADDEXP");
+    }
+    std::vector< std::pair<size_t, const void *> > args;
+    args.push_back(std::make_pair(sizeof(cl_mem), (void* )&src.data));
+    args.push_back(std::make_pair(sizeof(cl_int), (void* )&src_step));
+    args.push_back(std::make_pair(sizeof(cl_mem), (void* )&src_e.data));
+    args.push_back(std::make_pair(sizeof(cl_int), (void* )&src_e_step));
+    args.push_back(std::make_pair(sizeof(cl_mem), (void* )&dst.data));
+    args.push_back(std::make_pair(sizeof(cl_int), (void* )&dst_step));
+    args.push_back(std::make_pair(sizeof(cl_int), (void* )&src_rows));
+    args.push_back(std::make_pair(sizeof(cl_int), (void* )&src2_cols));
+    args.push_back(std::make_pair(sizeof(cl_int), (void* )&width));
+    float gamma = 0.0f;
+    if(!Context::getContext()->supportsFeature(Context::CL_DOUBLE))
+    {   // no double support on the device: pass gamma narrowed to float
+        gamma = (float)gamma1;
+        args.push_back(std::make_pair(sizeof(cl_float), (void* )&gamma));
+    }
+    else
+    {   // double support available: pass gamma unchanged
+        args.push_back(std::make_pair(sizeof(cl_double), (void* )&gamma1));
+    }
+
+    openCLExecuteKernel(clCxt, &svm, kernelName, globalThreads, localThreads, args, -1, -1, build_options);
+}
+
+float CvSVM_OCL::predict(const CvMat* samples, CV_OUT CvMat* results) const
+{
+    int var_count = get_var_count();
+    int sample_count = samples->rows;
+    // Batch prediction: stage the samples on the host, evaluate the kernel matrix on the device, then decide row by row.
+    Mat src_temp = Mat(sample_count, var_count, CV_32FC1);   // one prepared sample per row
+    CV_FUNCNAME( "CvSVM::predict" );
+
+    for(int i = 0; i < samples->rows; i++)
+    {
+        __CV_BEGIN__;
+        CvMat sample;
+        float* row_sample = 0;
+        cvGetRow( samples, &sample, i );
+        int class_count;
+        if( !kernel )
+        {
+            CV_ERROR( CV_StsBadArg, "The SVM should be trained first" );
+        }
+
+        class_count = class_labels ? class_labels->cols :
+                      params.svm_type == ONE_CLASS ? 1 : 0;
+
+        CV_CALL( cvPreparePredictData(&sample, var_all, var_idx,
+                                      class_count, 0, &row_sample ));
+        for(int j = 0; j < var_count; ++j)
+        {
+            src_temp.at<float>(i, j) = row_sample[j];
+        }
+        // cvPreparePredictData either returns the row's own data pointer or a cvAlloc'ed copy; release only the copy.
+        if( row_sample != sample.data.fl )
+            cvFree( &row_sample );
+        __CV_END__;
+    }
+
+    Mat dst1;
+    double alpha1 = 0.0, beta1 = 0.0, gamma1 = 0.0, degree1 = 0.0;   // kernel scalars; only those of the active kernel type are set
+    if(params.kernel_type == CvSVM::LINEAR)
+    {
+        alpha1 = 1;
+        beta1 = 0;
+    }
+    if(params.kernel_type == CvSVM::POLY)
+    {
+        alpha1 = params.gamma;
+        beta1 = params.coef0;
+        degree1 = params.degree;
+    }
+    if(params.kernel_type == CvSVM::SIGMOID)
+    {
+        alpha1 = - 2 * params.gamma;
+        beta1 = - 2 * params.coef0;
+    }
+    if(params.kernel_type == CvSVM::RBF)
+    {
+        gamma1 = - params.gamma;
+    }
+
+    Mat sv_temp = Mat(sv_total, var_count, CV_32FC1, Scalar::all(0));   // support vectors, one per row
+
+    for(int i = 0; i < sv_total; ++i)
+    {
+        for(int j = 0; j < var_count; ++j)
+        {
+            sv_temp.at<float>(i, j) = sv[i][j];
+        }
+    }
+    oclMat src(sample_count, var_count, CV_32FC1, Scalar::all(0));
+    oclMat sv_;
+
+    src.upload(src_temp);
+    oclMat dst;   // sample_count x sv_total matrix of kernel values
+
+#if defined HAVE_CLAMDBLAS
+
+    dst = oclMat(sample_count, sv_total, CV_32FC1);
+    oclMat src3(sample_count, sv_total, CV_32FC1, Scalar::all(1));
+    if(params.kernel_type != CvSVM::RBF)
+    {
+        Mat sv_temp1;
+        transpose(sv_temp, sv_temp1);
+        sv_.upload(sv_temp1);
+        gemm(src, sv_, alpha1, src3, beta1, dst);
+    }
+
+#else
+
+    if(!Context::getContext()->supportsFeature(Context::CL_DOUBLE))
+    {
+        dst = oclMat(sample_count, sv_total, CV_32FC1);
+    }
+    else
+    {
+        dst = oclMat(sample_count, sv_total, CV_64FC1);
+    }
+    if(params.kernel_type == CvSVM::LINEAR)
+    {
+        sv_.upload(sv_temp);
+        matmul_linear(src, sv_, dst, sample_count, sv_total, var_count, alpha1, beta1);
+    }
+    if( params.kernel_type == CvSVM::SIGMOID)
+    {
+        sv_.upload(sv_temp);
+        matmul_sigmod(src, sv_, dst, sample_count, sv_total, var_count, alpha1, beta1);
+    }
+
+    if(params.kernel_type == CvSVM::POLY)
+    {
+        sv_.upload(sv_temp);
+        if(sample_count > 0)
+        {
+            matmul_poly(src, sv_, dst, sample_count, sv_total, var_count, alpha1, beta1, degree1, true);
+        }
+        else
+        {
+            matmul_poly(src, sv_, dst, sample_count, sv_total, var_count, alpha1, beta1, degree1, false);
+        }
+    }
+#endif
+
+    if(params.kernel_type == CvSVM::RBF)   // RBF always uses the custom kernel, with or without clAmdBlas
+    {
+        sv_.upload(sv_temp);
+        if(!Context::getContext()->supportsFeature(Context::CL_DOUBLE))
+        {
+            dst = oclMat(sample_count, sv_total, CV_32FC1);
+        }
+        else
+        {
+            dst = oclMat(sample_count, sv_total, CV_64FC1);
+        }
+        if(sample_count > 0)
+        {
+            matmul_rbf(src, sv_, dst, sample_count, sv_total, var_count, gamma1, true);
+        }
+        else
+        {
+            matmul_rbf(src, sv_, dst, sample_count, sv_total, var_count, gamma1, false);
+        }
+    }
+    dst.download(dst1);
+
+    float result = 0;
+    for(int i = 0; i < samples->rows; i++ )
+    {
+        float r = this->predict(i, dst1);   // keep as float: EPS_SVR / NU_SVR responses are not integral
+        if (results)
+        {
+            results->data.fl[i] = r;
+        }
+        if (i == 0)
+        {
+            result = r;
+        }
+    }
+    return result;   // response for the first sample (0 when there are none)
+}
+void CvSVM_OCL::predict( cv::InputArray _samples, cv::OutputArray _results ) const
+{   // Allocate one float per sample row, then run the CvMat-based batch predict on header copies.
+    _results.create(_samples.size().height, 1, CV_32F);
+    CvMat samplesHdr = _samples.getMat(), resultsHdr = _results.getMat();
+    predict(&samplesHdr, &resultsHdr);
+}
+bool CvSVMSolver_ocl::solve_generic( CvSVMSolutionInfo& si )
+{
+    int iter = 0;
+    int i, j, k;
+
+    // 1. initialize gradient and alpha status
+    for( i = 0; i < alpha_count; i++ )
+    {
+        update_alpha_status(i);
+        G[i] = b[i];
+        if( fabs(G[i]) > 1e200 )
+        {
+            return false;
+        }
+    }
+    Mat dst1;
+    double alpha1 = 0.0, beta1 = 0.0, gamma1 = 0.0, degree1 = 0.0;
+    if(params->kernel_type == CvSVM::LINEAR)
+    {
+        alpha1 = 1;
+        beta1 = 0;
+    }
+    if(params->kernel_type == CvSVM::POLY)
+    {
+        alpha1 = params->gamma;
+        beta1 = params->coef0;
+        degree1 = params->degree;
+    }
+    if(params->kernel_type == CvSVM::SIGMOID)
+    {
+        alpha1 = -2 * params->gamma;
+        beta1 = -2 * params->coef0;
+    }
+    if(params->kernel_type == CvSVM::RBF)
+    {
+        gamma1 = -params->gamma;
+    }
+    Mat src1 = Mat(sample_count, var_count, CV_32FC1);
+
+    for(int i = 0; i < sample_count; ++i)
+    {
+        for(int j = 0; j < var_count; ++j)
+        {
+            src1.at<float>(i, j) = samples[i][j];
+        }
+    }
+    oclMat src, src_e;
+    src.upload(src1);
+    oclMat dst;
+
+#if defined HAVE_CLAMDBLAS
+
+    dst = oclMat(sample_count, sample_count, CV_32FC1);
+    oclMat src3(sample_count, sample_count, CV_32FC1, Scalar::all(1));
+    if(params->kernel_type != CvSVM::RBF)
+    {
+        ocl::transpose(src, src_e);
+        gemm(src, src_e, alpha1, src3, beta1, dst);
+    }
+
+#else
+    if(!Context::getContext()->supportsFeature(Context::CL_DOUBLE))
+    {
+        dst = oclMat(sample_count, sample_count, CV_32FC1);
+    }
+    else
+    {
+        dst = oclMat(sample_count, sample_count, CV_64FC1);
+    }
+    if(params->kernel_type == CvSVM::LINEAR )
+    {
+        src_e = src;
+        matmul_linear(src, src_e, dst, sample_count, sample_count, var_count, alpha1, beta1);
+    }
+    if( params->kernel_type == CvSVM::SIGMOID)
+    {
+        src_e = src;
+        matmul_sigmod(src, src_e, dst, sample_count, sample_count, var_count, alpha1, beta1);
+    }
+
+    if(params->kernel_type == CvSVM::POLY)
+    {
+        src_e = src;
+        if(sample_count > 0)
+        {
+            matmul_poly(src, src_e, dst, sample_count, sample_count, var_count, alpha1, beta1, degree1, true);
+        }
+        else
+        {
+            matmul_poly(src, src_e, dst, sample_count, sample_count, var_count, alpha1, beta1, degree1, false);
+        }
+    }
+
+#endif
+
+    if(params->kernel_type == CvSVM::RBF)
+    {
+        src_e = src;
+        if(!Context::getContext()->supportsFeature(Context::CL_DOUBLE))
+        {
+            dst = oclMat(sample_count, sample_count, CV_32FC1);
+        }
+        else
+        {
+            dst = oclMat(sample_count, sample_count, CV_64FC1);
+        }
+        if(sample_count > 0)
+        {
+            matmul_rbf(src, src_e, dst, sample_count, sample_count, var_count, gamma1, true);
+        }
+        else
+        {
+            matmul_rbf(src, src_e, dst, sample_count, sample_count, var_count, gamma1, false);
+        }
+    }
+    dst.download(dst1);
+    for( i = 0; i < alpha_count; i++ )
+    {
+        if( !is_lower_bound(i) )
+        {
+            const Qfloat *Q_i = CvSVMSolver::get_row( i, buf[0]);
+            double alpha_i = alpha[i];
+
+            for( j = 0; j < alpha_count; j++ )
+            {
+                G[j] += alpha_i * Q_i[j];
+            }
+        }
+    }
+
+    // 2. optimization loop
+    for(;;)
+    {
+        const Qfloat *Q_i, *Q_j;
+        double C_i, C_j;
+        double old_alpha_i, old_alpha_j, alpha_i, alpha_j;
+        double delta_alpha_i, delta_alpha_j;
+
+#ifdef _DEBUG
+        for( i = 0; i < alpha_count; i++ )
+        {
+            if( fabs(G[i]) > 1e+300 )
+            {
+                return false;
+            }
+
+            if( fabs(alpha[i]) > 1e16 )
+            {
+                return false;
+            }
+        }
+#endif
+
+        if( (this->*select_working_set_func)( i, j ) != 0 || iter++ >= max_iter )
+        {
+            break;
+        }
+        Q_i = get_row( i, buf[0], dst1);
+        Q_j = get_row( j, buf[1], dst1);
+
+        C_i = get_C(i);
+        C_j = get_C(j);
+
+        alpha_i = old_alpha_i = alpha[i];
+        alpha_j = old_alpha_j = alpha[j];
+
+        if( y[i] != y[j] )
+        {
+            double denom = Q_i[i] + Q_j[j] + 2 * Q_i[j];
+            double delta = (-G[i] - G[j]) / MAX(fabs(denom), FLT_EPSILON);
+            double diff = alpha_i - alpha_j;
+            alpha_i += delta;
+            alpha_j += delta;
+
+            if( diff > 0 && alpha_j < 0 )
+            {
+                alpha_j = 0;
+                alpha_i = diff;
+            }
+            else if( diff <= 0 && alpha_i < 0 )
+            {
+                alpha_i = 0;
+                alpha_j = -diff;
+            }
+
+            if( diff > C_i - C_j && alpha_i > C_i )
+            {
+                alpha_i = C_i;
+                alpha_j = C_i - diff;
+            }
+            else if( diff <= C_i - C_j && alpha_j > C_j )
+            {
+                alpha_j = C_j;
+                alpha_i = C_j + diff;
+            }
+        }
+        else
+        {
+            double denom = Q_i[i] + Q_j[j] - 2 * Q_i[j];
+            double delta = (G[i] - G[j]) / MAX(fabs(denom), FLT_EPSILON);
+            double sum = alpha_i + alpha_j;
+            alpha_i -= delta;
+            alpha_j += delta;
+
+            if( sum > C_i && alpha_i > C_i )
+            {
+                alpha_i = C_i;
+                alpha_j = sum - C_i;
+            }
+            else if( sum <= C_i && alpha_j < 0)
+            {
+                alpha_j = 0;
+                alpha_i = sum;
+            }
+
+            if( sum > C_j && alpha_j > C_j )
+            {
+                alpha_j = C_j;
+                alpha_i = sum - C_j;
+            }
+            else if( sum <= C_j && alpha_i < 0 )
+            {
+                alpha_i = 0;
+                alpha_j = sum;
+            }
+        }
+        // update alpha
+        alpha[i] = alpha_i;
+        alpha[j] = alpha_j;
+        update_alpha_status(i);
+        update_alpha_status(j);
+
+        // update G
+        delta_alpha_i = alpha_i - old_alpha_i;
+        delta_alpha_j = alpha_j - old_alpha_j;
+
+        for( k = 0; k < alpha_count; k++ )
+        {
+            G[k] += Q_i[k] * delta_alpha_i + Q_j[k] * delta_alpha_j;
+        }
+    }
+
+    // calculate rho
+    (this->*calc_rho_func)( si.rho, si.r );
+
+    // calculate objective value
+    for( i = 0, si.obj = 0; i < alpha_count; i++ )
+    {
+        si.obj += alpha[i] * (G[i] + b[i]);
+    }
+
+    si.obj *= 0.5;
+
+    si.upper_bound_p = C[1];
+    si.upper_bound_n = C[0];
+
+    return true;
+}
+
+// Fills results by invoking calc_func_ocl, the pointer-to-member callback selected in create().
+void CvSVMKernel_ocl::calc( int vcount, const int row_idx, Qfloat* results, Mat& src)
+{
+    (this->*calc_func_ocl)( vcount, row_idx, results, src);
+
+#if defined HAVE_CLAMDBLAS
+    // Cap every entry at FLT_MAX * 1e-3; entries at or below the cap (and NaNs) pass through unchanged.
+    const Qfloat cap = (Qfloat)(FLT_MAX * 1e-3);
+    for( int idx = 0; idx < vcount; ++idx )
+    {
+        Qfloat& entry = results[idx];
+        if( entry > cap )
+        {
+            entry = cap;
+        }
+    }
+#endif
+}
+bool CvSVMKernel_ocl::create( const CvSVMParams* _params, Calc_ocl _calc_func, Calc _calc_func1 )
+{ // Resets the object, stores params and both callbacks, and fills in a default for each null callback. Always returns true.
+    clear();
+    params = _params;
+    calc_func_ocl = _calc_func;
+    calc_func = _calc_func1;
+    if( !calc_func_ocl ) // no OpenCL-path callback supplied: choose by params->kernel_type, any unlisted type -> calc_linear
+        calc_func_ocl = params->kernel_type == CvSVM::RBF ? &CvSVMKernel_ocl::calc_rbf :
+                        params->kernel_type == CvSVM::POLY ? &CvSVMKernel_ocl::calc_poly :
+                        params->kernel_type == CvSVM::SIGMOID ? &CvSVMKernel_ocl::calc_sigmoid :
+                        &CvSVMKernel_ocl::calc_linear;
+    if( !calc_func) // same fallback for the CvSVMKernel callback; params is dereferenced only inside these two branches
+        calc_func = params->kernel_type == CvSVM::RBF ? &CvSVMKernel::calc_rbf :
+                    params->kernel_type == CvSVM::POLY ? &CvSVMKernel::calc_poly :
+                    params->kernel_type == CvSVM::SIGMOID ? &CvSVMKernel::calc_sigmoid :
+                    &CvSVMKernel::calc_linear;
+    return true;
+}
+CvSVMKernel_ocl::CvSVMKernel_ocl(const CvSVMParams* params, CvSVMKernel_ocl::Calc_ocl _calc_func, CvSVMKernel::Calc _calc_func1)
+{ // Constructs the kernel by delegating to create(); null callbacks are resolved there from params->kernel_type.
+    CvSVMKernel::clear(); // base-class reset; create() below calls clear() once more
+    CvSVMKernel_ocl::create( params, _calc_func, _calc_func1 ); // return value (always true) is ignored
+}
+// Copies vcount elements from row row_idx of src into results, converting each one to Qfloat.
+// The element type read from src depends on the build and on the device:
+//   - HAVE_CLAMDBLAS builds always read float;
+//   - other builds read double when the context reports CL_DOUBLE support, float otherwise.
+// NOTE(review): src is assumed to hold elements of the type chosen here - confirm against the caller.
+void CvSVMKernel_ocl::calc_non_rbf_base( int vcount, const int row_idx, Qfloat* results, Mat& src)
+{
+#if defined HAVE_CLAMDBLAS
+    const bool read_double = false;
+#else
+    const bool read_double = Context::getContext()->supportsFeature(Context::CL_DOUBLE);
+#endif
+
+    for(int col = 0; col < vcount; ++col)
+    {
+        if(read_double)
+        {
+            results[col] = (Qfloat) * src.ptr<double>(row_idx, col);
+        }
+        else
+        {
+            results[col] = (Qfloat) * src.ptr<float>(row_idx, col);
+        }
+    }
+}
+// Copies vcount elements from row row_idx of src into results, converting each one to Qfloat.
+void CvSVMKernel_ocl::calc_rbf( int vcount, const int row_idx, Qfloat* results, Mat& src)
+{
+    // The element type follows the device capability: double with CL_DOUBLE support, float without.
+    const bool has_double = Context::getContext()->supportsFeature(Context::CL_DOUBLE);
+    for(int col = 0; col < vcount; ++col)
+    {
+        if(has_double)
+        {
+            results[col] = (Qfloat) * src.ptr<double>(row_idx, col);
+        }
+        else
+        {
+            results[col] = (Qfloat) * src.ptr<float>(row_idx, col);
+        }
+    }
+}
+void CvSVMKernel_ocl::calc_linear( int vcount, const int row_idx, Qfloat* results, Mat& src )
+{ // Linear kernel: the row copied by calc_non_rbf_base() is used as-is, with no further transformation here.
+    calc_non_rbf_base( vcount, row_idx, results, src);
+}
+
+// Polynomial kernel row: copies the row, then (HAVE_CLAMDBLAS builds only) raises every
+// element to the power params->degree, in place.
+// NOTE(review): other builds return the row as copied - presumably src is already transformed; confirm.
+void CvSVMKernel_ocl::calc_poly( int vcount, const int row_idx, Qfloat* results, Mat& src)
+{
+    calc_non_rbf_base( vcount, row_idx, results, src);
+
+#if defined HAVE_CLAMDBLAS
+    // View results through a 1 x vcount CvMat header so cvPow can read and write the same buffer.
+    CvMat row_view = cvMat( 1, vcount, QFLOAT_TYPE, results );
+    if( vcount > 0 )
+        cvPow( &row_view, &row_view, params->degree );
+#endif
+}
+
+
+// Sigmoid kernel row: copies the row, then (HAVE_CLAMDBLAS builds only) replaces each
+// element t by (1 - exp(-t)) / (1 + exp(-t)), which is tanh(t / 2).
+// NOTE(review): other builds return the row as copied - presumably src is already transformed; confirm.
+void CvSVMKernel_ocl::calc_sigmoid( int vcount, const int row_idx, Qfloat* results, Mat& src)
+{
+    calc_non_rbf_base( vcount, row_idx, results, src);
+
+#if defined HAVE_CLAMDBLAS
+    // TODO: speedup this
+    for(int idx = 0; idx < vcount; ++idx )
+    {
+        const Qfloat t = results[idx];
+        // exp(-|t|) lies in (0, 1], so the exponential cannot overflow for large |t|;
+        // the sign of t then selects the formula or its mirrored form.
+        const double e = exp(-fabs(t));
+        results[idx] = (Qfloat)(t > 0 ? (1. - e) / (1. + e)
+                                      : (e - 1.) / (e + 1.));
+    }
+#endif
+}
+// Default constructor: all initial state comes from the CvSVM base-class default constructor.
+CvSVM_OCL::CvSVM_OCL() : CvSVM()
+{ // Intentionally empty: a `CvSVM();` statement here would only create and destroy a temporary.
+}
+
+CvSVM_OCL::CvSVM_OCL( const Mat& _train_data, const Mat& _responses,
+                      const Mat& _var_idx, const Mat& _sample_idx, CvSVMParams _params )
+{ // Convenience constructor: zeroes the model members, sets the default model name, then trains on the given data.
+    decision_func = 0;
+    class_labels = 0;
+    class_weights = 0;
+    storage = 0;
+    var_idx = 0;
+    kernel = 0;
+    solver = 0;
+    default_model_name = "my_svm";
+    // Anything train() returns is discarded, so a failed training is not reported by this constructor.
+    train( _train_data, _responses, _var_idx, _sample_idx, _params );
+}
+
+void CvSVM_OCL::create_kernel()
+{ // Installs a CvSVMKernel_ocl; both callbacks are passed as 0, so create() picks them from params.kernel_type.
+    kernel = new CvSVMKernel_ocl(&params, 0, 0); // NOTE(review): any previous value of kernel is overwritten without being released here
+}
+void CvSVM_OCL::create_solver( )
+{ // Installs a CvSVMSolver_ocl constructed from this model's params.
+    solver = new CvSVMSolver_ocl(&params); // NOTE(review): any previous value of solver is overwritten without being released here
+}
diff --git a/modules/ocl/src/tvl1flow.cpp b/modules/ocl/src/tvl1flow.cpp
index daf3a2295..b9529f305 100644
--- a/modules/ocl/src/tvl1flow.cpp
+++ b/modules/ocl/src/tvl1flow.cpp
@@ -293,14 +293,14 @@ void ocl_tvl1flow::centeredGradient(const oclMat &src, oclMat &dx, oclMat &dy)
     int dx_step = dx.step/dElememntSize;
 
     String kernelName = "centeredGradientKernel";
-    vector< pair<size_t, const void *> > args;
-    args.push_back( make_pair( sizeof(cl_mem), (void*)&src.data));
-    args.push_back( make_pair( sizeof(cl_int), (void*)&src.cols));
-    args.push_back( make_pair( sizeof(cl_int), (void*)&src.rows));
-    args.push_back( make_pair( sizeof(cl_int), (void*)&src_step));
-    args.push_back( make_pair( sizeof(cl_mem), (void*)&dx.data));
-    args.push_back( make_pair( sizeof(cl_mem), (void*)&dy.data));
-    args.push_back( make_pair( sizeof(cl_int), (void*)&dx_step));
+    std::vector< std::pair<size_t, const void *> > args;
+    args.push_back( std::make_pair( sizeof(cl_mem), (void*)&src.data));
+    args.push_back( std::make_pair( sizeof(cl_int), (void*)&src.cols));
+    args.push_back( std::make_pair( sizeof(cl_int), (void*)&src.rows));
+    args.push_back( std::make_pair( sizeof(cl_int), (void*)&src_step));
+    args.push_back( std::make_pair( sizeof(cl_mem), (void*)&dx.data));
+    args.push_back( std::make_pair( sizeof(cl_mem), (void*)&dy.data));
+    args.push_back( std::make_pair( sizeof(cl_int), (void*)&dx_step));
     openCLExecuteKernel(clCxt, &tvl1flow, kernelName, globalThreads, localThreads, args, -1, -1);
 
 }
@@ -335,23 +335,23 @@ void ocl_tvl1flow::estimateDualVariables(oclMat &u1, oclMat &u2, oclMat &p11, oc
     u2_offset_x = u2_offset_x/u2.elemSize();
 
     String kernelName = "estimateDualVariablesKernel";
-    vector< pair<size_t, const void *> > args;
-    args.push_back( make_pair( sizeof(cl_mem), (void*)&u1.data));
-    args.push_back( make_pair( sizeof(cl_int), (void*)&u1.cols));
-    args.push_back( make_pair( sizeof(cl_int), (void*)&u1.rows));
-    args.push_back( make_pair( sizeof(cl_int), (void*)&u1_step));
-    args.push_back( make_pair( sizeof(cl_mem), (void*)&u2.data));
-    args.push_back( make_pair( sizeof(cl_mem), (void*)&p11.data));
-    args.push_back( make_pair( sizeof(cl_int), (void*)&p11_step));
-    args.push_back( make_pair( sizeof(cl_mem), (void*)&p12.data));
-    args.push_back( make_pair( sizeof(cl_mem), (void*)&p21.data));
-    args.push_back( make_pair( sizeof(cl_mem), (void*)&p22.data));
-    args.push_back( make_pair( sizeof(cl_float), (void*)&taut));
-    args.push_back( make_pair( sizeof(cl_int), (void*)&u2_step));
-    args.push_back( make_pair( sizeof(cl_int), (void*)&u1_offset_x));
-    args.push_back( make_pair( sizeof(cl_int), (void*)&u1_offset_y));
-    args.push_back( make_pair( sizeof(cl_int), (void*)&u2_offset_x));
-    args.push_back( make_pair( sizeof(cl_int), (void*)&u2_offset_y));
+    std::vector< std::pair<size_t, const void *> > args;
+    args.push_back( std::make_pair( sizeof(cl_mem), (void*)&u1.data));
+    args.push_back( std::make_pair( sizeof(cl_int), (void*)&u1.cols));
+    args.push_back( std::make_pair( sizeof(cl_int), (void*)&u1.rows));
+    args.push_back( std::make_pair( sizeof(cl_int), (void*)&u1_step));
+    args.push_back( std::make_pair( sizeof(cl_mem), (void*)&u2.data));
+    args.push_back( std::make_pair( sizeof(cl_mem), (void*)&p11.data));
+    args.push_back( std::make_pair( sizeof(cl_int), (void*)&p11_step));
+    args.push_back( std::make_pair( sizeof(cl_mem), (void*)&p12.data));
+    args.push_back( std::make_pair( sizeof(cl_mem), (void*)&p21.data));
+    args.push_back( std::make_pair( sizeof(cl_mem), (void*)&p22.data));
+    args.push_back( std::make_pair( sizeof(cl_float), (void*)&taut));
+    args.push_back( std::make_pair( sizeof(cl_int), (void*)&u2_step));
+    args.push_back( std::make_pair( sizeof(cl_int), (void*)&u1_offset_x));
+    args.push_back( std::make_pair( sizeof(cl_int), (void*)&u1_offset_y));
+    args.push_back( std::make_pair( sizeof(cl_int), (void*)&u2_offset_x));
+    args.push_back( std::make_pair( sizeof(cl_int), (void*)&u2_offset_y));
 
     openCLExecuteKernel(clCxt, &tvl1flow, kernelName, globalThread, localThread, args, -1, -1);
 }
@@ -389,30 +389,30 @@ void ocl_tvl1flow::estimateU(oclMat &I1wx, oclMat &I1wy, oclMat &grad,
     u2_offset_x = u2_offset_x/u2.elemSize();
 
     String kernelName = "estimateUKernel";
-    vector< pair<size_t, const void *> > args;
-    args.push_back( make_pair( sizeof(cl_mem), (void*)&I1wx.data));
-    args.push_back( make_pair( sizeof(cl_int), (void*)&I1wx.cols));
-    args.push_back( make_pair( sizeof(cl_int), (void*)&I1wx.rows));
-    args.push_back( make_pair( sizeof(cl_int), (void*)&I1wx_step));
-    args.push_back( make_pair( sizeof(cl_mem), (void*)&I1wy.data));
-    args.push_back( make_pair( sizeof(cl_mem), (void*)&grad.data));
-    args.push_back( make_pair( sizeof(cl_mem), (void*)&rho_c.data));
-    args.push_back( make_pair( sizeof(cl_mem), (void*)&p11.data));
-    args.push_back( make_pair( sizeof(cl_mem), (void*)&p12.data));
-    args.push_back( make_pair( sizeof(cl_mem), (void*)&p21.data));
-    args.push_back( make_pair( sizeof(cl_mem), (void*)&p22.data));
-    args.push_back( make_pair( sizeof(cl_mem), (void*)&u1.data));
-    args.push_back( make_pair( sizeof(cl_int), (void*)&u1_step));
-    args.push_back( make_pair( sizeof(cl_mem), (void*)&u2.data));
-    args.push_back( make_pair( sizeof(cl_mem), (void*)&error.data));
-    args.push_back( make_pair( sizeof(cl_float), (void*)&l_t));
-    args.push_back( make_pair( sizeof(cl_float), (void*)&theta));
-    args.push_back( make_pair( sizeof(cl_int), (void*)&u2_step));
-    args.push_back( make_pair( sizeof(cl_int), (void*)&u1_offset_x));
-    args.push_back( make_pair( sizeof(cl_int), (void*)&u1_offset_y));
-    args.push_back( make_pair( sizeof(cl_int), (void*)&u2_offset_x));
-    args.push_back( make_pair( sizeof(cl_int), (void*)&u2_offset_y));
-    args.push_back( make_pair( sizeof(cl_char), (void*)&calc_error));
+    std::vector< std::pair<size_t, const void *> > args;
+    args.push_back( std::make_pair( sizeof(cl_mem), (void*)&I1wx.data));
+    args.push_back( std::make_pair( sizeof(cl_int), (void*)&I1wx.cols));
+    args.push_back( std::make_pair( sizeof(cl_int), (void*)&I1wx.rows));
+    args.push_back( std::make_pair( sizeof(cl_int), (void*)&I1wx_step));
+    args.push_back( std::make_pair( sizeof(cl_mem), (void*)&I1wy.data));
+    args.push_back( std::make_pair( sizeof(cl_mem), (void*)&grad.data));
+    args.push_back( std::make_pair( sizeof(cl_mem), (void*)&rho_c.data));
+    args.push_back( std::make_pair( sizeof(cl_mem), (void*)&p11.data));
+    args.push_back( std::make_pair( sizeof(cl_mem), (void*)&p12.data));
+    args.push_back( std::make_pair( sizeof(cl_mem), (void*)&p21.data));
+    args.push_back( std::make_pair( sizeof(cl_mem), (void*)&p22.data));
+    args.push_back( std::make_pair( sizeof(cl_mem), (void*)&u1.data));
+    args.push_back( std::make_pair( sizeof(cl_int), (void*)&u1_step));
+    args.push_back( std::make_pair( sizeof(cl_mem), (void*)&u2.data));
+    args.push_back( std::make_pair( sizeof(cl_mem), (void*)&error.data));
+    args.push_back( std::make_pair( sizeof(cl_float), (void*)&l_t));
+    args.push_back( std::make_pair( sizeof(cl_float), (void*)&theta));
+    args.push_back( std::make_pair( sizeof(cl_int), (void*)&u2_step));
+    args.push_back( std::make_pair( sizeof(cl_int), (void*)&u1_offset_x));
+    args.push_back( std::make_pair( sizeof(cl_int), (void*)&u1_offset_y));
+    args.push_back( std::make_pair( sizeof(cl_int), (void*)&u2_offset_x));
+    args.push_back( std::make_pair( sizeof(cl_int), (void*)&u2_offset_y));
+    args.push_back( std::make_pair( sizeof(cl_char), (void*)&calc_error));
 
     openCLExecuteKernel(clCxt, &tvl1flow, kernelName, globalThread, localThread, args, -1, -1);
 }
@@ -460,28 +460,28 @@ void ocl_tvl1flow::warpBackward(const oclMat &I0, const oclMat &I1, oclMat &I1x,
     I1y_tex = bindTexture(I1y);
 
     String kernelName = "warpBackwardKernel";
-    vector< pair<size_t, const void *> > args;
-    args.push_back( make_pair( sizeof(cl_mem), (void*)&I0.data));
-    args.push_back( make_pair( sizeof(cl_int), (void*)&I0Step));
-    args.push_back( make_pair( sizeof(cl_int), (void*)&I0.cols));
-    args.push_back( make_pair( sizeof(cl_int), (void*)&I0.rows));
-    args.push_back( make_pair( sizeof(cl_mem), (void*)&I1_tex));
-    args.push_back( make_pair( sizeof(cl_mem), (void*)&I1x_tex));
-    args.push_back( make_pair( sizeof(cl_mem), (void*)&I1y_tex));
-    args.push_back( make_pair( sizeof(cl_mem), (void*)&u1.data));
-    args.push_back( make_pair( sizeof(cl_int), (void*)&u1Step));
-    args.push_back( make_pair( sizeof(cl_mem), (void*)&u2.data));
-    args.push_back( make_pair( sizeof(cl_mem), (void*)&I1w.data));
-    args.push_back( make_pair( sizeof(cl_mem), (void*)&I1wx.data));
-    args.push_back( make_pair( sizeof(cl_mem), (void*)&I1wy.data));
-    args.push_back( make_pair( sizeof(cl_mem), (void*)&grad.data));
-    args.push_back( make_pair( sizeof(cl_mem), (void*)&rho.data));
-    args.push_back( make_pair( sizeof(cl_int), (void*)&I1w_step));
-    args.push_back( make_pair( sizeof(cl_int), (void*)&u2Step));
-    args.push_back( make_pair( sizeof(cl_int), (void*)&u1_offset_x));
-    args.push_back( make_pair( sizeof(cl_int), (void*)&u1_offset_y));
-    args.push_back( make_pair( sizeof(cl_int), (void*)&u2_offset_x));
-    args.push_back( make_pair( sizeof(cl_int), (void*)&u2_offset_y));
+    std::vector< std::pair<size_t, const void *> > args;
+    args.push_back( std::make_pair( sizeof(cl_mem), (void*)&I0.data));
+    args.push_back( std::make_pair( sizeof(cl_int), (void*)&I0Step));
+    args.push_back( std::make_pair( sizeof(cl_int), (void*)&I0.cols));
+    args.push_back( std::make_pair( sizeof(cl_int), (void*)&I0.rows));
+    args.push_back( std::make_pair( sizeof(cl_mem), (void*)&I1_tex));
+    args.push_back( std::make_pair( sizeof(cl_mem), (void*)&I1x_tex));
+    args.push_back( std::make_pair( sizeof(cl_mem), (void*)&I1y_tex));
+    args.push_back( std::make_pair( sizeof(cl_mem), (void*)&u1.data));
+    args.push_back( std::make_pair( sizeof(cl_int), (void*)&u1Step));
+    args.push_back( std::make_pair( sizeof(cl_mem), (void*)&u2.data));
+    args.push_back( std::make_pair( sizeof(cl_mem), (void*)&I1w.data));
+    args.push_back( std::make_pair( sizeof(cl_mem), (void*)&I1wx.data));
+    args.push_back( std::make_pair( sizeof(cl_mem), (void*)&I1wy.data));
+    args.push_back( std::make_pair( sizeof(cl_mem), (void*)&grad.data));
+    args.push_back( std::make_pair( sizeof(cl_mem), (void*)&rho.data));
+    args.push_back( std::make_pair( sizeof(cl_int), (void*)&I1w_step));
+    args.push_back( std::make_pair( sizeof(cl_int), (void*)&u2Step));
+    args.push_back( std::make_pair( sizeof(cl_int), (void*)&u1_offset_x));
+    args.push_back( std::make_pair( sizeof(cl_int), (void*)&u1_offset_y));
+    args.push_back( std::make_pair( sizeof(cl_int), (void*)&u2_offset_x));
+    args.push_back( std::make_pair( sizeof(cl_int), (void*)&u2_offset_y));
 
     openCLExecuteKernel(clCxt, &tvl1flow, kernelName, globalThread, localThread, args, -1, -1);
 
diff --git a/modules/ocl/test/test_arithm.cpp b/modules/ocl/test/test_arithm.cpp
index fa9d09999..9b20dbf89 100644
--- a/modules/ocl/test/test_arithm.cpp
+++ b/modules/ocl/test/test_arithm.cpp
@@ -50,10 +50,6 @@
 //
 //M*/
 
-//#define PRINT_CPU_TIME 1000
-//#define PRINT_TIME
-
-
 #include "test_precomp.hpp"
 #include <iomanip>
 
@@ -65,392 +61,506 @@ using namespace cvtest;
 using namespace testing;
 using namespace std;
 
-PARAM_TEST_CASE(ArithmTestBase, MatType, bool)
+//////////////////////////////// LUT /////////////////////////////////////////////////
+
+PARAM_TEST_CASE(Lut, int, int, bool, bool)
 {
-    int type;
-    cv::Scalar val;
+    int lut_depth;
+    int cn;
+    bool use_roi, same_cn;
 
-    //src mat
-    cv::Mat mat1;
-    cv::Mat mat2;
-    cv::Mat mask;
+    // src mat
+    cv::Mat src;
+    cv::Mat lut;
     cv::Mat dst;
-    cv::Mat dst1; //bak, for two outputs
 
-    // set up roi
-    int roicols;
-    int roirows;
-    int src1x;
-    int src1y;
-    int src2x;
-    int src2y;
-    int dstx;
-    int dsty;
-    int maskx;
-    int masky;
-
-    //src mat with roi
-    cv::Mat mat1_roi;
-    cv::Mat mat2_roi;
-    cv::Mat mask_roi;
+    // src mat with roi
+    cv::Mat src_roi;
+    cv::Mat lut_roi;
     cv::Mat dst_roi;
-    cv::Mat dst1_roi; //bak
 
-    //ocl dst mat for testing
+    // ocl dst mat for testing
     cv::ocl::oclMat gdst_whole;
-    cv::ocl::oclMat gdst1_whole; //bak
 
-    //ocl mat with roi
-    cv::ocl::oclMat gmat1;
-    cv::ocl::oclMat gmat2;
+    // ocl mat with roi
+    cv::ocl::oclMat gsrc;
+    cv::ocl::oclMat glut;
     cv::ocl::oclMat gdst;
-    cv::ocl::oclMat gdst1;   //bak
-    cv::ocl::oclMat gmask;
 
     virtual void SetUp()
     {
-        type = GET_PARAM(0);
+        lut_depth = GET_PARAM(0);
+        cn = GET_PARAM(1);
+        same_cn = GET_PARAM(2);
+        use_roi = GET_PARAM(3);
+
+        const int src_type = CV_MAKE_TYPE(CV_8U, cn);
+        const int lut_type = CV_MAKE_TYPE(lut_depth, same_cn ? cn : 1);
+        const int dst_type = CV_MAKE_TYPE(lut_depth, cn);
 
         cv::RNG &rng = TS::ptr()->get_rng();
 
-        cv::Size size(MWIDTH, MHEIGHT);
-
-        mat1 = randomMat(rng, size, type, 5, 16, false);
-        //mat2 = randomMat(rng, size, type, 5, 16, false);
-        mat2 = randomMat(rng, size, type, 5, 16, false);
-        dst  = randomMat(rng, size, type, 5, 16, false);
-        dst1  = randomMat(rng, size, type, 5, 16, false);
-        mask = randomMat(rng, size, CV_8UC1, 0, 2,  false);
-
-        cv::threshold(mask, mask, 0.5, 255., CV_8UC1);
-
-        val = cv::Scalar(rng.uniform(-10.0, 10.0), rng.uniform(-10.0, 10.0), rng.uniform(-10.0, 10.0), rng.uniform(-10.0, 10.0));
-
+        src = randomMat(rng, randomSize(MIN_VALUE, MAX_VALUE), src_type, 0, 256, false);
+        lut = randomMat(rng, use_roi ? randomSize(260, 300) : Size(256, 1), lut_type, 5, 16, false);
+        dst = randomMat(rng, use_roi ? randomSize(MIN_VALUE, MAX_VALUE) : src.size(), dst_type, 5, 16, false);
     }
 
     void random_roi()
     {
-#ifdef RANDOMROI
-        //randomize ROI
-        cv::RNG &rng = TS::ptr()->get_rng();
-        roicols = rng.uniform(1, mat1.cols);
-        roirows = rng.uniform(1, mat1.rows);
-        src1x   = rng.uniform(0, mat1.cols - roicols);
-        src1y   = rng.uniform(0, mat1.rows - roirows);
-        dstx    = rng.uniform(0, dst.cols  - roicols);
-        dsty    = rng.uniform(0, dst.rows  - roirows);
-        maskx   = rng.uniform(0, mask.cols - roicols);
-        masky   = rng.uniform(0, mask.rows - roirows);
-        src2x   = rng.uniform(0, mat2.cols - roicols);
-        src2y   = rng.uniform(0, mat2.rows - roirows);
-#else
-        roicols = mat1.cols;
-        roirows = mat1.rows;
-        src1x = 0;
-        src1y = 0;
-        dstx = 0;
-        dsty = 0;
-        maskx   = 0;
-        masky   = 0;
-        src2x   = 0;
-        src2y   = 0;
-#endif
-        mat1_roi = mat1(Rect(src1x, src1y, roicols, roirows));
-        mat2_roi = mat2(Rect(src2x, src2y, roicols, roirows));
-        mask_roi = mask(Rect(maskx, masky, roicols, roirows));
-        dst_roi  = dst(Rect(dstx, dsty, roicols, roirows));
-        dst1_roi = dst1(Rect(dstx, dsty, roicols, roirows));
+        // set up roi
+        int roicols, roirows;
+        int srcx, srcy;
+        int lutx, luty;
+        int dstx, dsty;
+
+        if (use_roi)
+        {
+            // randomize ROI
+            cv::RNG &rng = TS::ptr()->get_rng();
+
+            roicols = rng.uniform(1, MIN_VALUE);
+            roirows = rng.uniform(1, MIN_VALUE);
+
+            srcx = rng.uniform(0, src.cols - roicols);
+            srcy = rng.uniform(0, src.rows - roirows);
+            lutx = rng.uniform(0, lut.cols - 256);
+            luty = rng.uniform(0, lut.rows - 1);
+
+            dstx = rng.uniform(0, dst.cols - roicols);
+            dsty = rng.uniform(0, dst.rows - roirows);
+        }
+        else
+        {
+            roicols = src.cols;
+            roirows = src.rows;
+            srcx = srcy = 0;
+            lutx = luty = 0;
+            dstx = dsty = 0;
+        }
+
+        src_roi = src(Rect(srcx, srcy, roicols, roirows));
+        lut_roi = lut(Rect(lutx, luty, 256, 1));
+        dst_roi = dst(Rect(dstx, dsty, roicols, roirows));
 
         gdst_whole = dst;
         gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
 
-        gdst1_whole = dst1;
-        gdst1 = gdst1_whole(Rect(dstx, dsty, roicols, roirows));
-
-        gmat1 = mat1_roi;
-        gmat2 = mat2_roi;
-        gmask = mask_roi; //end
+        gsrc = src_roi;
+        glut = lut_roi;
     }
 
     void Near(double threshold = 0.)
     {
         EXPECT_MAT_NEAR(dst, Mat(gdst_whole), threshold);
+        EXPECT_MAT_NEAR(dst_roi, Mat(gdst), threshold);
+    }
+};
+
+TEST_P(Lut, Mat)
+{ // Compares cv::ocl::LUT with cv::LUT, LOOP_TIMES times, re-running random_roi() before each comparison.
+    for (int j = 0; j < LOOP_TIMES; j++)
+    {
+        random_roi();
+
+        cv::LUT(src_roi, lut_roi, dst_roi); // host reference result
+        cv::ocl::LUT(gsrc, glut, gdst); // OpenCL implementation under test
+
+        Near(); // default threshold 0.; checks both the whole destination and the ROI
+    }
+}
+
+///////////////////////// ArithmTestBase ///////////////////////////
+
+PARAM_TEST_CASE(ArithmTestBase, int, int, bool)
+{
+    int depth;
+    int cn;
+    bool use_roi;
+    cv::Scalar val;
+
+    // src mat
+    cv::Mat src1;
+    cv::Mat src2;
+    cv::Mat mask;
+    cv::Mat dst1;
+    cv::Mat dst2; // for two outputs
+
+    // set up roi
+    int roicols, roirows;
+    int src1x, src1y;
+    int src2x, src2y;
+    int dst1x, dst1y;
+    int dst2x, dst2y;
+    int maskx, masky;
+
+    // src mat with roi
+    cv::Mat src1_roi;
+    cv::Mat src2_roi;
+    cv::Mat mask_roi;
+    cv::Mat dst1_roi;
+    cv::Mat dst2_roi; // for two outputs
+
+    // ocl dst mat for testing
+    cv::ocl::oclMat gdst1_whole;
+    cv::ocl::oclMat gdst2_whole; // for two outputs
+
+    // ocl mat with roi
+    cv::ocl::oclMat gsrc1;
+    cv::ocl::oclMat gsrc2;
+    cv::ocl::oclMat gdst1;
+    cv::ocl::oclMat gdst2; // for two outputs
+    cv::ocl::oclMat gmask;
+
+    virtual void SetUp()
+    { // Reads the test parameters (depth, channel count, ROI flag) and generates random inputs, mask and scalar.
+        depth = GET_PARAM(0);
+        cn = GET_PARAM(1);
+        use_roi = GET_PARAM(2);
+        const int type = CV_MAKE_TYPE(depth, cn);
+
+        cv::RNG &rng = TS::ptr()->get_rng();
+        // Without ROI testing every matrix takes src1's size; with it each one gets its own random size.
+        src1 = randomMat(rng, randomSize(MIN_VALUE, MAX_VALUE), type, 5, 16, false);
+        src2 = randomMat(rng, !use_roi ? src1.size() : randomSize(MIN_VALUE, MAX_VALUE), type, -15440, 14450, false);
+        dst1 = randomMat(rng, !use_roi ? src1.size() : randomSize(MIN_VALUE, MAX_VALUE), type, 5, 16, false);
+        dst2 = randomMat(rng, !use_roi ? src1.size() : randomSize(MIN_VALUE, MAX_VALUE), type, 5, 16, false);
+        mask = randomMat(rng, !use_roi ? src1.size() : randomSize(MIN_VALUE, MAX_VALUE), CV_8UC1, 0, 2, false);
+        // Binarise the mask to 0/255. The last argument of cv::threshold is a threshold type, not a matrix type.
+        cv::threshold(mask, mask, 0.5, 255., cv::THRESH_BINARY);
+
+        val = cv::Scalar(rng.uniform(-100.0, 100.0), rng.uniform(-100.0, 100.0),
+                         rng.uniform(-100.0, 100.0), rng.uniform(-100.0, 100.0));
+    }
+
+    void random_roi()
+    { // Chooses one ROI size shared by all matrices plus a per-matrix offset, then rebuilds the host ROI views and the oclMat counterparts.
+        if (use_roi)
+        {
+            // randomize ROI
+            cv::RNG &rng = TS::ptr()->get_rng();
+
+            roicols = rng.uniform(1, MIN_VALUE); // cv::RNG::uniform excludes the upper bound, so the ROI is narrower than MIN_VALUE
+            roirows = rng.uniform(1, MIN_VALUE);
+
+            src1x = rng.uniform(0, src1.cols - roicols); // offsets are drawn so that offset + ROI extent stays within the matrix
+            src1y = rng.uniform(0, src1.rows - roirows);
+            src2x = rng.uniform(0, src2.cols - roicols);
+            src2y = rng.uniform(0, src2.rows - roirows);
+
+            dst1x = rng.uniform(0, dst1.cols - roicols);
+            dst1y = rng.uniform(0, dst1.rows - roirows);
+            dst2x = rng.uniform(0, dst2.cols - roicols);
+            dst2y = rng.uniform(0, dst2.rows - roirows);
+
+            maskx = rng.uniform(0, mask.cols - roicols);
+            masky = rng.uniform(0, mask.rows - roirows);
+        }
+        else
+        { // no ROI: use the full src1 extent at offset 0 (SetUp gives every matrix src1's size in this mode)
+            roicols = src1.cols;
+            roirows = src1.rows;
+            src1x = src1y = 0;
+            src2x = src2y = 0;
+            dst1x = dst1y = 0;
+            dst2x = dst2y = 0;
+            maskx = masky = 0;
+        }
+
+        src1_roi = src1(Rect(src1x, src1y, roicols, roirows));
+        src2_roi = src2(Rect(src2x, src2y, roicols, roirows));
+        mask_roi = mask(Rect(maskx, masky, roicols, roirows));
+        dst1_roi = dst1(Rect(dst1x, dst1y, roicols, roirows));
+        dst2_roi = dst2(Rect(dst2x, dst2y, roicols, roirows));
+
+        gdst1_whole = dst1; // whole first destination as an oclMat; gdst1 below is the matching sub-region of it
+        gdst1 = gdst1_whole(Rect(dst1x, dst1y, roicols, roirows));
+
+        gdst2_whole = dst2; // same for the second destination
+        gdst2 = gdst2_whole(Rect(dst2x, dst2y, roicols, roirows));
+
+        gsrc1 = src1_roi; // inputs and mask are converted from their ROI views only
+        gsrc2 = src2_roi;
+        gmask = mask_roi;
+    }
+
+    void Near(double threshold = 0.)
+    { // Checks the first destination against its OpenCL counterpart within the given threshold.
+        EXPECT_MAT_NEAR(dst1, Mat(gdst1_whole), threshold); // whole matrix, including everything outside the ROI
+        EXPECT_MAT_NEAR(dst1_roi, Mat(gdst1), threshold); // the ROI on its own
+    }
 
     void Near1(double threshold = 0.)
     {
-        EXPECT_MAT_NEAR(dst1, Mat(gdst1_whole), threshold);
+        EXPECT_MAT_NEAR(dst2, Mat(gdst2_whole), threshold);
+        EXPECT_MAT_NEAR(dst2_roi, Mat(gdst2), threshold);
     }
-
 };
-////////////////////////////////lut/////////////////////////////////////////////////
-struct Lut : ArithmTestBase {};
-#define VARNAME(A) string(#A);
 
+//////////////////////////////// Exp /////////////////////////////////////////////////
 
-TEST_P(Lut, Mat)
-{
-
-    cv::Mat mat2(3, 512, CV_8UC1);
-    cv::RNG &rng = TS::ptr()->get_rng();
-    rng.fill(mat2, cv::RNG::UNIFORM, cv::Scalar::all(0), cv::Scalar::all(256));
-
-    for(int j = 0; j < LOOP_TIMES; j ++)
-    {
-        random_roi();
-
-        src2x = rng.uniform( 0, mat2.cols - 256);
-        src2y = rng.uniform (0, mat2.rows - 1);
-
-        cv::Mat mat2_roi = mat2(Rect(src2x, src2y, 256, 1));
-
-        cv::ocl::oclMat gmat2(mat2_roi);
-
-        cv::LUT(mat1_roi, mat2_roi, dst_roi);
-        cv::ocl::LUT(gmat1, gmat2, gdst);
-        Near(0);
-    }
-}
-
-
-////////////////////////////////exp/////////////////////////////////////////////////
-struct Exp : ArithmTestBase {};
+typedef ArithmTestBase Exp;
 
 TEST_P(Exp, Mat)
 {
-    for(int j = 0; j < LOOP_TIMES; j++)
+    for (int j = 0; j < LOOP_TIMES; j++)
     {
         random_roi();
 
-        cv::exp(mat1_roi, dst_roi);
-        cv::ocl::exp(gmat1, gdst);
+        cv::exp(src1_roi, dst1_roi);
+        cv::ocl::exp(gsrc1, gdst1);
+
         Near(2);
     }
 }
 
+//////////////////////////////// Log /////////////////////////////////////////////////
 
-////////////////////////////////log/////////////////////////////////////////////////
-struct Log : ArithmTestBase {};
+typedef ArithmTestBase Log;
 
 TEST_P(Log, Mat)
 {
-    for(int j = 0; j < LOOP_TIMES; j++)
+    for (int j = 0; j < LOOP_TIMES; j++)
     {
         random_roi();
 
-        cv::log(mat1_roi, dst_roi);
-        cv::ocl::log(gmat1, gdst);
+        cv::log(src1_roi, dst1_roi);
+        cv::ocl::log(gsrc1, gdst1);
         Near(1);
     }
 }
 
+//////////////////////////////// Add /////////////////////////////////////////////////
 
-////////////////////////////////add/////////////////////////////////////////////////
-struct Add : ArithmTestBase {};
+typedef ArithmTestBase Add;
 
 TEST_P(Add, Mat)
 {
-    for(int j = 0; j < LOOP_TIMES; j++)
+    for (int j = 0; j < LOOP_TIMES; j++)
     {
         random_roi();
 
-        cv::add(mat1_roi, mat2_roi, dst_roi);
-        cv::ocl::add(gmat1, gmat2, gdst);
+        cv::add(src1_roi, src2_roi, dst1_roi);
+        cv::ocl::add(gsrc1, gsrc2, gdst1);
         Near(0);
     }
 }
 
 TEST_P(Add, Mat_Mask)
 {
-    for(int j = 0; j < LOOP_TIMES; j++)
+    for (int j = 0; j < LOOP_TIMES; j++)
     {
         random_roi();
 
-        cv::add(mat1_roi, mat2_roi, dst_roi, mask_roi);
-        cv::ocl::add(gmat1, gmat2, gdst, gmask);
+        cv::add(src1_roi, src2_roi, dst1_roi, mask_roi);
+        cv::ocl::add(gsrc1, gsrc2, gdst1, gmask);
         Near(0);
     }
 }
 
 TEST_P(Add, Scalar)
 {
-    for(int j = 0; j < LOOP_TIMES; j++)
+    for (int j = 0; j < LOOP_TIMES; j++)
     {
         random_roi();
 
-        cv::add(mat1_roi, val, dst_roi);
-        cv::ocl::add(gmat1, val, gdst);
+        cv::add(src1_roi, val, dst1_roi);
+        cv::ocl::add(gsrc1, val, gdst1);
         Near(1e-5);
     }
 }
 
 TEST_P(Add, Scalar_Mask)
 {
-    for(int j = 0; j < LOOP_TIMES; j++)
+    for (int j = 0; j < LOOP_TIMES; j++)
     {
         random_roi();
 
-        cv::add(mat1_roi, val, dst_roi, mask_roi);
-        cv::ocl::add(gmat1, val, gdst, gmask);
+        cv::add(src1_roi, val, dst1_roi, mask_roi);
+        cv::ocl::add(gsrc1, val, gdst1, gmask);
         Near(1e-5);
     }
 }
 
+//////////////////////////////// Sub /////////////////////////////////////////////////
 
-
-////////////////////////////////sub/////////////////////////////////////////////////
-struct Sub : ArithmTestBase {};
+typedef ArithmTestBase Sub;
 
 TEST_P(Sub, Mat)
 {
-    for(int j = 0; j < LOOP_TIMES; j++)
+    for (int j = 0; j < LOOP_TIMES; j++)
     {
         random_roi();
 
-        cv::subtract(mat1_roi, mat2_roi, dst_roi);
-        cv::ocl::subtract(gmat1, gmat2, gdst);
+        cv::subtract(src1_roi, src2_roi, dst1_roi);
+        cv::ocl::subtract(gsrc1, gsrc2, gdst1);
+
         Near(0);
     }
 }
 
 TEST_P(Sub, Mat_Mask)
 {
-    for(int j = 0; j < LOOP_TIMES; j++)
+    for (int j = 0; j < LOOP_TIMES; j++)
     {
         random_roi();
 
-        cv::subtract(mat1_roi, mat2_roi, dst_roi, mask_roi);
-        cv::ocl::subtract(gmat1, gmat2, gdst, gmask);
+        cv::subtract(src1_roi, src2_roi, dst1_roi, mask_roi);
+        cv::ocl::subtract(gsrc1, gsrc2, gdst1, gmask);
         Near(0);
     }
 }
 
 TEST_P(Sub, Scalar)
 {
-    for(int j = 0; j < LOOP_TIMES; j++)
+    for (int j = 0; j < LOOP_TIMES; j++)
     {
         random_roi();
 
-        cv::subtract(mat1_roi, val, dst_roi);
-        cv::ocl::subtract(gmat1, val, gdst);
+        cv::subtract(src1_roi, val, dst1_roi);
+        cv::ocl::subtract(gsrc1, val, gdst1);
+
         Near(1e-5);
     }
 }
 
 TEST_P(Sub, Scalar_Mask)
 {
-    for(int j = 0; j < LOOP_TIMES; j++)
+    for (int j = 0; j < LOOP_TIMES; j++)
     {
         random_roi();
 
-        cv::subtract(mat1_roi, val, dst_roi, mask_roi);
-        cv::ocl::subtract(gmat1, val, gdst, gmask);
+        cv::subtract(src1_roi, val, dst1_roi, mask_roi);
+        cv::ocl::subtract(gsrc1, val, gdst1, gmask);
         Near(1e-5);
     }
 }
 
+//////////////////////////////// Mul /////////////////////////////////////////////////
 
-
-////////////////////////////////Mul/////////////////////////////////////////////////
-struct Mul : ArithmTestBase {};
+typedef ArithmTestBase Mul;
 
 TEST_P(Mul, Mat)
 {
-    for(int j = 0; j < LOOP_TIMES; j++)
+    for (int j = 0; j < LOOP_TIMES; j++)
     {
         random_roi();
 
-        cv::multiply(mat1_roi, mat2_roi, dst_roi);
-        cv::ocl::multiply(gmat1, gmat2, gdst);
+        cv::multiply(src1_roi, src2_roi, dst1_roi);
+        cv::ocl::multiply(gsrc1, gsrc2, gdst1);
         Near(0);
     }
 }
 
-TEST_P(Mul, Mat_Scalar)
+TEST_P(Mul, Scalar)
 {
-    for(int j = 0; j < LOOP_TIMES; j++)
+    for (int j = 0; j < LOOP_TIMES; j++)
     {
         random_roi();
 
-        cv::RNG &rng = TS::ptr()->get_rng();
-        double s = rng.uniform(-10.0, 10.0);
+        cv::multiply(val[0], src1_roi, dst1_roi);
+        cv::ocl::multiply(val[0], gsrc1, gdst1);
 
-        cv::multiply(mat1_roi, mat2_roi, dst_roi, s);
-        cv::ocl::multiply(gmat1, gmat2, gdst, s);
-        Near(.001);
+        Near(gdst1.depth() >= CV_32F ? 1e-3 : 1);
     }
 }
 
 
-
-struct Div : ArithmTestBase {};
-
-TEST_P(Div, Mat)
+TEST_P(Mul, Mat_Scalar)
 {
-    for(int j = 0; j < LOOP_TIMES; j++)
+    for (int j = 0; j < LOOP_TIMES; j++)
     {
         random_roi();
 
-        cv::divide(mat1_roi, mat2_roi, dst_roi);
-        cv::ocl::divide(gmat1, gmat2, gdst);
+        cv::multiply(src1_roi, src2_roi, dst1_roi, val[0]);
+        cv::ocl::multiply(gsrc1, gsrc2, gdst1, val[0]);
+
+        Near(gdst1.depth() >= CV_32F ? 1e-3 : 1);
+    }
+}
+
+//////////////////////////////// Div /////////////////////////////////////////////////
+
+typedef ArithmTestBase Div;
+
+TEST_P(Div, Mat)
+{
+    for (int j = 0; j < LOOP_TIMES; j++)
+    {
+        random_roi();
+
+        cv::divide(src1_roi, src2_roi, dst1_roi);
+        cv::ocl::divide(gsrc1, gsrc2, gdst1);
         Near(1);
     }
 }
 
-TEST_P(Div, Mat_Scalar)
+TEST_P(Div, Scalar)
 {
-    for(int j = 0; j < LOOP_TIMES; j++)
+    for (int j = 0; j < LOOP_TIMES; j++)
     {
         random_roi();
 
-        cv::RNG &rng = TS::ptr()->get_rng();
-        double s = rng.uniform(-10.0, 10.0);
+        cv::divide(val[0], src1_roi, dst1_roi);
+        cv::ocl::divide(val[0], gsrc1, gdst1);
 
-        cv::divide(mat1_roi, mat2_roi, dst_roi, s);
-        cv::ocl::divide(gmat1, gmat2, gdst, s);
-        Near(.001);
+        Near(gdst1.depth() >= CV_32F ? 1e-3 : 1);
     }
 }
 
 
-struct Absdiff : ArithmTestBase {};
-
-TEST_P(Absdiff, Mat)
+TEST_P(Div, Mat_Scalar)
 {
-    for(int j = 0; j < LOOP_TIMES; j++)
+    for (int j = 0; j < LOOP_TIMES; j++)
     {
         random_roi();
 
-        cv::absdiff(mat1_roi, mat2_roi, dst_roi);
-        cv::ocl::absdiff(gmat1, gmat2, gdst);
+        cv::divide(src1_roi, src2_roi, dst1_roi, val[0]);
+        cv::ocl::divide(gsrc1, gsrc2, gdst1, val[0]);
+
+        Near(gdst1.depth() >= CV_32F ? 1e-3 : 1);
+    }
+}
+
+//////////////////////////////// Absdiff /////////////////////////////////////////////////
+
+typedef ArithmTestBase Absdiff;
+
+TEST_P(Absdiff, Mat)
+{
+    for (int j = 0; j < LOOP_TIMES; j++)
+    {
+        random_roi();
+
+        cv::absdiff(src1_roi, src2_roi, dst1_roi);
+        cv::ocl::absdiff(gsrc1, gsrc2, gdst1);
         Near(0);
     }
 }
 
 TEST_P(Absdiff, Mat_Scalar)
 {
-    for(int j = 0; j < LOOP_TIMES; j++)
+    for (int j = 0; j < LOOP_TIMES; j++)
     {
         random_roi();
 
-        cv::absdiff(mat1_roi, val, dst_roi);
-        cv::ocl::absdiff(gmat1, val, gdst);
+        cv::absdiff(src1_roi, val, dst1_roi);
+        cv::ocl::absdiff(gsrc1, val, gdst1);
         Near(1e-5);
     }
 }
 
+//////////////////////////////// CartToPolar /////////////////////////////////////////////////
 
-
-struct CartToPolar : ArithmTestBase {};
+typedef ArithmTestBase CartToPolar;
 
 TEST_P(CartToPolar, angleInDegree)
 {
-    for(int j = 0; j < LOOP_TIMES; j++)
+    for (int j = 0; j < LOOP_TIMES; j++)
     {
         random_roi();
 
-        cv::cartToPolar(mat1_roi, mat2_roi, dst_roi, dst1_roi, 1);
-        cv::ocl::cartToPolar(gmat1, gmat2, gdst, gdst1, 1);
+        cv::cartToPolar(src1_roi, src2_roi, dst1_roi, dst2_roi, true);
+        cv::ocl::cartToPolar(gsrc1, gsrc2, gdst1, gdst2, true);
         Near(.5);
         Near1(.5);
     }
@@ -458,28 +568,30 @@ TEST_P(CartToPolar, angleInDegree)
 
 TEST_P(CartToPolar, angleInRadians)
 {
-    for(int j = 0; j < LOOP_TIMES; j++)
+    for (int j = 0; j < LOOP_TIMES; j++)
     {
         random_roi();
 
-        cv::cartToPolar(mat1_roi, mat2_roi, dst_roi, dst1_roi, 0);
-        cv::ocl::cartToPolar(gmat1, gmat2, gdst, gdst1, 0);
+        cv::cartToPolar(src1_roi, src2_roi, dst1_roi, dst2_roi);
+        cv::ocl::cartToPolar(gsrc1, gsrc2, gdst1, gdst2);
         Near(.5);
         Near1(.5);
     }
 }
 
+//////////////////////////////// PolarToCart /////////////////////////////////////////////////
 
-struct PolarToCart : ArithmTestBase {};
+typedef ArithmTestBase PolarToCart;
 
 TEST_P(PolarToCart, angleInDegree)
 {
-    for(int j = 0; j < LOOP_TIMES; j++)
+    for (int j = 0; j < LOOP_TIMES; j++)
     {
         random_roi();
 
-        cv::polarToCart(mat1_roi, mat2_roi, dst_roi, dst1_roi, 1);
-        cv::ocl::polarToCart(gmat1, gmat2, gdst, gdst1, 1);
+        cv::polarToCart(src1_roi, src2_roi, dst1_roi, dst2_roi, true);
+        cv::ocl::polarToCart(gsrc1, gsrc2, gdst1, gdst2, true);
+
         Near(.5);
         Near1(.5);
     }
@@ -487,144 +599,179 @@ TEST_P(PolarToCart, angleInDegree)
 
 TEST_P(PolarToCart, angleInRadians)
 {
-    for(int j = 0; j < LOOP_TIMES; j++)
+    for (int j = 0; j < LOOP_TIMES; j++)
     {
         random_roi();
 
-        cv::polarToCart(mat1_roi, mat2_roi, dst_roi, dst1_roi, 0);
-        cv::ocl::polarToCart(gmat1, gmat2, gdst, gdst1, 0);
+        cv::polarToCart(src1_roi, src2_roi, dst1_roi, dst2_roi);
+        cv::ocl::polarToCart(gsrc1, gsrc2, gdst1, gdst2);
+
         Near(.5);
         Near1(.5);
     }
 }
 
+//////////////////////////////// Magnitude /////////////////////////////////////////////////
 
-
-
-struct Magnitude : ArithmTestBase {};
+typedef ArithmTestBase Magnitude;
 
 TEST_P(Magnitude, Mat)
 {
-    for(int j = 0; j < LOOP_TIMES; j++)
+    for (int j = 0; j < LOOP_TIMES; j++)
     {
         random_roi();
 
-        cv::magnitude(mat1_roi, mat2_roi, dst_roi);
-        cv::ocl::magnitude(gmat1, gmat2, gdst);
-        Near(1e-5);
+        cv::magnitude(src1_roi, src2_roi, dst1_roi);
+        cv::ocl::magnitude(gsrc1, gsrc2, gdst1);
+        Near(depth == CV_64F ? 1e-5 : 1e-2);
     }
 }
 
+//////////////////////////////// Transpose /////////////////////////////////////////////////
 
-struct Transpose : ArithmTestBase {};
+typedef ArithmTestBase Transpose;
 
 TEST_P(Transpose, Mat)
 {
-    for(int j = 0; j < LOOP_TIMES; j++)
+    for (int j = 0; j < LOOP_TIMES; j++)
     {
         random_roi();
 
-        cv::transpose(mat1_roi, dst_roi);
-        cv::ocl::transpose(gmat1, gdst);
+        cv::transpose(src1_roi, dst1_roi);
+        cv::ocl::transpose(gsrc1, gdst1);
+
         Near(1e-5);
     }
 }
 
+TEST_P(Transpose, SquareInplace)
+{
+    cv::RNG &rng = TS::ptr()->get_rng();
+    int value = randomInt(MIN_VALUE, MAX_VALUE);
+    src1 = randomMat(rng, Size(value, value), CV_MAKE_TYPE(depth, cn), 5, 16, false);
 
-struct Flip : ArithmTestBase {};
+    if (use_roi)
+    {
+        roirows = roicols = randomInt(1, src1.cols);
+
+        src1x = randomInt(0, src1.cols - roicols);
+        src1y = randomInt(0, src1.rows - roirows);
+    }
+    else
+    {
+        roicols = roirows = src1.cols;
+        src1x = src1y = 0;
+    }
+
+    Rect r(src1x, src1y, roicols, roirows);
+    src1_roi = src1(r);
+    gdst1_whole = src1;
+    gdst1 = gdst1_whole(r);
+
+    for (int j = 0; j < LOOP_TIMES; j++)
+    {
+        cv::transpose(src1_roi, src1_roi);
+        cv::ocl::transpose(gdst1, gdst1);
+
+        EXPECT_MAT_NEAR(src1, Mat(gdst1_whole), 0.0);
+        EXPECT_MAT_NEAR(src1_roi, Mat(gdst1), 0.0);
+    }
+}
+
+//////////////////////////////// Flip /////////////////////////////////////////////////
+
+typedef ArithmTestBase Flip;
 
 TEST_P(Flip, X)
 {
-    for(int j = 0; j < LOOP_TIMES; j++)
+    for (int j = 0; j < LOOP_TIMES; j++)
     {
         random_roi();
 
-        cv::flip(mat1_roi, dst_roi, 0);
-        cv::ocl::flip(gmat1, gdst, 0);
+        cv::flip(src1_roi, dst1_roi, 0);
+        cv::ocl::flip(gsrc1, gdst1, 0);
         Near(1e-5);
     }
 }
 
 TEST_P(Flip, Y)
 {
-    for(int j = 0; j < LOOP_TIMES; j++)
+    for (int j = 0; j < LOOP_TIMES; j++)
     {
         random_roi();
 
-        cv::flip(mat1_roi, dst_roi, 1);
-        cv::ocl::flip(gmat1, gdst, 1);
+        cv::flip(src1_roi, dst1_roi, 1);
+        cv::ocl::flip(gsrc1, gdst1, 1);
         Near(1e-5);
     }
 }
 
 TEST_P(Flip, BOTH)
 {
-    for(int j = 0; j < LOOP_TIMES; j++)
+    for (int j = 0; j < LOOP_TIMES; j++)
     {
         random_roi();
 
-        cv::flip(mat1_roi, dst_roi, -1);
-        cv::ocl::flip(gmat1, gdst, -1);
+        cv::flip(src1_roi, dst1_roi, -1);
+        cv::ocl::flip(gsrc1, gdst1, -1);
         Near(1e-5);
     }
 }
 
+//////////////////////////////// MinMax /////////////////////////////////////////////////
 
-struct MinMax : ArithmTestBase {};
+typedef ArithmTestBase MinMax;
 
 TEST_P(MinMax, MAT)
 {
-    for(int j = 0; j < LOOP_TIMES; j++)
+    for (int j = 0; j < LOOP_TIMES; j++)
     {
         random_roi();
-        double minVal, maxVal;
-        cv::Point minLoc, maxLoc;
 
-        if (mat1.depth() != CV_8S)
-        {
-            cv::minMaxLoc(mat1_roi, &minVal, &maxVal, &minLoc, &maxLoc);
-        }
+        double minVal, maxVal;
+
+        if (src1.depth() != CV_8S)
+            cv::minMaxIdx(src1_roi, &minVal, &maxVal, NULL, NULL);
         else
         {
             minVal = std::numeric_limits<double>::max();
             maxVal = -std::numeric_limits<double>::max();
-            for (int i = 0; i < mat1_roi.rows; ++i)
-                for (int j = 0; j < mat1_roi.cols; ++j)
+            for (int i = 0; i < src1_roi.rows; ++i)
+                for (int j = 0; j < src1_roi.cols; ++j)
                 {
-                    signed char val = mat1_roi.at<signed char>(i, j);
+                    signed char val = src1_roi.at<signed char>(i, j);
                     if (val < minVal) minVal = val;
-                    if (val > maxVal) maxVal = val;
+                    if (val > maxVal) maxVal = val; // not "else if": a new minimum (e.g. the first element) may also be the maximum
                 }
         }
 
         double minVal_, maxVal_;
-        cv::ocl::minMax(gmat1, &minVal_, &maxVal_);
+        cv::ocl::minMax(gsrc1, &minVal_, &maxVal_);
 
         EXPECT_DOUBLE_EQ(minVal_, minVal);
         EXPECT_DOUBLE_EQ(maxVal_, maxVal);
     }
 }
 
-TEST_P(MinMax, MASK)
+TEST_P(MinMax, DISABLED_MASK)
 {
-    for(int j = 0; j < LOOP_TIMES; j++)
+    for (int j = 0; j < LOOP_TIMES; j++)
     {
         random_roi();
+
         double minVal, maxVal;
         cv::Point minLoc, maxLoc;
 
-        if (mat1.depth() != CV_8S)
-        {
-            cv::minMaxLoc(mat1_roi, &minVal, &maxVal, &minLoc, &maxLoc, mask_roi);
-        }
+        if (src1.depth() != CV_8S)
+            cv::minMaxLoc(src1_roi, &minVal, &maxVal, &minLoc, &maxLoc, mask_roi);
         else
         {
             minVal = std::numeric_limits<double>::max();
             maxVal = -std::numeric_limits<double>::max();
-            for (int i = 0; i < mat1_roi.rows; ++i)
-                for (int j = 0; j < mat1_roi.cols; ++j)
+            for (int i = 0; i < src1_roi.rows; ++i)
+                for (int j = 0; j < src1_roi.cols; ++j)
                 {
-                    signed char val = mat1_roi.at<signed char>(i, j);
+                    signed char val = src1_roi.at<signed char>(i, j);
                     unsigned char m = mask_roi.at<unsigned char>(i, j);
                     if (val < minVal && m) minVal = val;
                     if (val > maxVal && m) maxVal = val;
@@ -632,36 +779,37 @@ TEST_P(MinMax, MASK)
         }
 
         double minVal_, maxVal_;
-        cv::ocl::minMax(gmat1, &minVal_, &maxVal_, gmask);
+        cv::ocl::minMax(gsrc1, &minVal_, &maxVal_, gmask);
 
-        EXPECT_DOUBLE_EQ(minVal_, minVal);
-        EXPECT_DOUBLE_EQ(maxVal_, maxVal);
+        EXPECT_DOUBLE_EQ(minVal, minVal_);
+        EXPECT_DOUBLE_EQ(maxVal, maxVal_);
     }
 }
 
+//////////////////////////////// MinMaxLoc /////////////////////////////////////////////////
 
-struct MinMaxLoc : ArithmTestBase {};
+typedef ArithmTestBase MinMaxLoc;
 
 TEST_P(MinMaxLoc, MAT)
 {
-    for(int j = 0; j < LOOP_TIMES; j++)
+    for (int j = 0; j < LOOP_TIMES; j++)
     {
         random_roi();
+
         double minVal, maxVal;
         cv::Point minLoc, maxLoc;
-        int depth = mat1.depth();
+        int depth = src1.depth();
+
         if (depth != CV_8S)
-        {
-            cv::minMaxLoc(mat1_roi, &minVal, &maxVal, &minLoc, &maxLoc);
-        }
+            cv::minMaxLoc(src1_roi, &minVal, &maxVal, &minLoc, &maxLoc);
         else
         {
             minVal = std::numeric_limits<double>::max();
             maxVal = -std::numeric_limits<double>::max();
-            for (int i = 0; i < mat1_roi.rows; ++i)
-                for (int j = 0; j < mat1_roi.cols; ++j)
+            for (int i = 0; i < src1_roi.rows; ++i)
+                for (int j = 0; j < src1_roi.cols; ++j)
                 {
-                    signed char val = mat1_roi.at<signed char>(i, j);
+                    signed char val = src1_roi.at<signed char>(i, j);
                     if (val < minVal)
                     {
                         minVal = val;
@@ -679,71 +827,71 @@ TEST_P(MinMaxLoc, MAT)
 
         double minVal_, maxVal_;
         cv::Point minLoc_, maxLoc_;
-        cv::ocl::minMaxLoc(gmat1, &minVal_, &maxVal_, &minLoc_, &maxLoc_, cv::ocl::oclMat());
+        cv::ocl::minMaxLoc(gsrc1, &minVal_, &maxVal_, &minLoc_, &maxLoc_, cv::ocl::oclMat());
 
         double error0 = 0., error1 = 0., minlocVal = 0., minlocVal_ = 0., maxlocVal = 0., maxlocVal_ = 0.;
-        if(depth == 0)
+        if (depth == 0)
         {
-            minlocVal = mat1_roi.at<unsigned char>(minLoc);
-            minlocVal_ = mat1_roi.at<unsigned char>(minLoc_);
-            maxlocVal = mat1_roi.at<unsigned char>(maxLoc);
-            maxlocVal_ = mat1_roi.at<unsigned char>(maxLoc_);
-            error0 = ::abs(mat1_roi.at<unsigned char>(minLoc_) - mat1_roi.at<unsigned char>(minLoc));
-            error1 = ::abs(mat1_roi.at<unsigned char>(maxLoc_) - mat1_roi.at<unsigned char>(maxLoc));
+            minlocVal = src1_roi.at<unsigned char>(minLoc);
+            minlocVal_ = src1_roi.at<unsigned char>(minLoc_);
+            maxlocVal = src1_roi.at<unsigned char>(maxLoc);
+            maxlocVal_ = src1_roi.at<unsigned char>(maxLoc_);
+            error0 = ::abs(src1_roi.at<unsigned char>(minLoc_) - src1_roi.at<unsigned char>(minLoc));
+            error1 = ::abs(src1_roi.at<unsigned char>(maxLoc_) - src1_roi.at<unsigned char>(maxLoc));
         }
-        if(depth == 1)
+        if (depth == 1)
         {
-            minlocVal = mat1_roi.at<signed char>(minLoc);
-            minlocVal_ = mat1_roi.at<signed char>(minLoc_);
-            maxlocVal = mat1_roi.at<signed char>(maxLoc);
-            maxlocVal_ = mat1_roi.at<signed char>(maxLoc_);
-            error0 = ::abs(mat1_roi.at<signed char>(minLoc_) - mat1_roi.at<signed char>(minLoc));
-            error1 = ::abs(mat1_roi.at<signed char>(maxLoc_) - mat1_roi.at<signed char>(maxLoc));
+            minlocVal = src1_roi.at<signed char>(minLoc);
+            minlocVal_ = src1_roi.at<signed char>(minLoc_);
+            maxlocVal = src1_roi.at<signed char>(maxLoc);
+            maxlocVal_ = src1_roi.at<signed char>(maxLoc_);
+            error0 = ::abs(src1_roi.at<signed char>(minLoc_) - src1_roi.at<signed char>(minLoc));
+            error1 = ::abs(src1_roi.at<signed char>(maxLoc_) - src1_roi.at<signed char>(maxLoc));
         }
-        if(depth == 2)
+        if (depth == 2)
         {
-            minlocVal = mat1_roi.at<unsigned short>(minLoc);
-            minlocVal_ = mat1_roi.at<unsigned short>(minLoc_);
-            maxlocVal = mat1_roi.at<unsigned short>(maxLoc);
-            maxlocVal_ = mat1_roi.at<unsigned short>(maxLoc_);
-            error0 = ::abs(mat1_roi.at<unsigned short>(minLoc_) - mat1_roi.at<unsigned short>(minLoc));
-            error1 = ::abs(mat1_roi.at<unsigned short>(maxLoc_) - mat1_roi.at<unsigned short>(maxLoc));
+            minlocVal = src1_roi.at<unsigned short>(minLoc);
+            minlocVal_ = src1_roi.at<unsigned short>(minLoc_);
+            maxlocVal = src1_roi.at<unsigned short>(maxLoc);
+            maxlocVal_ = src1_roi.at<unsigned short>(maxLoc_);
+            error0 = ::abs(src1_roi.at<unsigned short>(minLoc_) - src1_roi.at<unsigned short>(minLoc));
+            error1 = ::abs(src1_roi.at<unsigned short>(maxLoc_) - src1_roi.at<unsigned short>(maxLoc));
         }
-        if(depth == 3)
+        if (depth == 3)
         {
-            minlocVal = mat1_roi.at<signed short>(minLoc);
-            minlocVal_ = mat1_roi.at<signed short>(minLoc_);
-            maxlocVal = mat1_roi.at<signed short>(maxLoc);
-            maxlocVal_ = mat1_roi.at<signed short>(maxLoc_);
-            error0 = ::abs(mat1_roi.at<signed short>(minLoc_) - mat1_roi.at<signed short>(minLoc));
-            error1 = ::abs(mat1_roi.at<signed short>(maxLoc_) - mat1_roi.at<signed short>(maxLoc));
+            minlocVal = src1_roi.at<signed short>(minLoc);
+            minlocVal_ = src1_roi.at<signed short>(minLoc_);
+            maxlocVal = src1_roi.at<signed short>(maxLoc);
+            maxlocVal_ = src1_roi.at<signed short>(maxLoc_);
+            error0 = ::abs(src1_roi.at<signed short>(minLoc_) - src1_roi.at<signed short>(minLoc));
+            error1 = ::abs(src1_roi.at<signed short>(maxLoc_) - src1_roi.at<signed short>(maxLoc));
         }
-        if(depth == 4)
+        if (depth == 4)
         {
-            minlocVal = mat1_roi.at<int>(minLoc);
-            minlocVal_ = mat1_roi.at<int>(minLoc_);
-            maxlocVal = mat1_roi.at<int>(maxLoc);
-            maxlocVal_ = mat1_roi.at<int>(maxLoc_);
-            error0 = ::abs(mat1_roi.at<int>(minLoc_) - mat1_roi.at<int>(minLoc));
-            error1 = ::abs(mat1_roi.at<int>(maxLoc_) - mat1_roi.at<int>(maxLoc));
+            minlocVal = src1_roi.at<int>(minLoc);
+            minlocVal_ = src1_roi.at<int>(minLoc_);
+            maxlocVal = src1_roi.at<int>(maxLoc);
+            maxlocVal_ = src1_roi.at<int>(maxLoc_);
+            error0 = ::abs(src1_roi.at<int>(minLoc_) - src1_roi.at<int>(minLoc));
+            error1 = ::abs(src1_roi.at<int>(maxLoc_) - src1_roi.at<int>(maxLoc));
         }
-        if(depth == 5)
+        if (depth == 5)
         {
-            minlocVal = mat1_roi.at<float>(minLoc);
-            minlocVal_ = mat1_roi.at<float>(minLoc_);
-            maxlocVal = mat1_roi.at<float>(maxLoc);
-            maxlocVal_ = mat1_roi.at<float>(maxLoc_);
-            error0 = ::abs(mat1_roi.at<float>(minLoc_) - mat1_roi.at<float>(minLoc));
-            error1 = ::abs(mat1_roi.at<float>(maxLoc_) - mat1_roi.at<float>(maxLoc));
+            minlocVal = src1_roi.at<float>(minLoc);
+            minlocVal_ = src1_roi.at<float>(minLoc_);
+            maxlocVal = src1_roi.at<float>(maxLoc);
+            maxlocVal_ = src1_roi.at<float>(maxLoc_);
+            error0 = ::abs(src1_roi.at<float>(minLoc_) - src1_roi.at<float>(minLoc));
+            error1 = ::abs(src1_roi.at<float>(maxLoc_) - src1_roi.at<float>(maxLoc));
         }
-        if(depth == 6)
+        if (depth == 6)
         {
-            minlocVal = mat1_roi.at<double>(minLoc);
-            minlocVal_ = mat1_roi.at<double>(minLoc_);
-            maxlocVal = mat1_roi.at<double>(maxLoc);
-            maxlocVal_ = mat1_roi.at<double>(maxLoc_);
-            error0 = ::abs(mat1_roi.at<double>(minLoc_) - mat1_roi.at<double>(minLoc));
-            error1 = ::abs(mat1_roi.at<double>(maxLoc_) - mat1_roi.at<double>(maxLoc));
+            minlocVal = src1_roi.at<double>(minLoc);
+            minlocVal_ = src1_roi.at<double>(minLoc_);
+            maxlocVal = src1_roi.at<double>(maxLoc);
+            maxlocVal_ = src1_roi.at<double>(maxLoc_);
+            error0 = ::abs(src1_roi.at<double>(minLoc_) - src1_roi.at<double>(minLoc));
+            error1 = ::abs(src1_roi.at<double>(maxLoc_) - src1_roi.at<double>(maxLoc));
         }
 
         EXPECT_DOUBLE_EQ(minVal_, minVal);
@@ -756,27 +904,24 @@ TEST_P(MinMaxLoc, MAT)
     }
 }
 
-
 TEST_P(MinMaxLoc, MASK)
 {
-    for(int j = 0; j < LOOP_TIMES; j++)
+    for (int j = 0; j < LOOP_TIMES; j++)
     {
         random_roi();
         double minVal, maxVal;
         cv::Point minLoc, maxLoc;
-        int depth = mat1.depth();
+        int depth = src1.depth();
         if (depth != CV_8S)
-        {
-            cv::minMaxLoc(mat1_roi, &minVal, &maxVal, &minLoc, &maxLoc, mask_roi);
-        }
+            cv::minMaxLoc(src1_roi, &minVal, &maxVal, &minLoc, &maxLoc, mask_roi);
         else
         {
             minVal = std::numeric_limits<double>::max();
             maxVal = -std::numeric_limits<double>::max();
-            for (int i = 0; i < mat1_roi.rows; ++i)
-                for (int j = 0; j < mat1_roi.cols; ++j)
+            for (int i = 0; i < src1_roi.rows; ++i)
+                for (int j = 0; j < src1_roi.cols; ++j)
                 {
-                    signed char val = mat1_roi.at<signed char>(i, j);
+                    signed char val = src1_roi.at<signed char>(i, j);
                     unsigned char m = mask_roi.at<unsigned char>(i , j);
                     if (val < minVal && m)
                     {
@@ -795,72 +940,72 @@ TEST_P(MinMaxLoc, MASK)
 
         double minVal_, maxVal_;
         cv::Point minLoc_, maxLoc_;
-        cv::ocl::minMaxLoc(gmat1, &minVal_, &maxVal_, &minLoc_, &maxLoc_, gmask);
+        cv::ocl::minMaxLoc(gsrc1, &minVal_, &maxVal_, &minLoc_, &maxLoc_, gmask);
 
         double error0 = 0., error1 = 0., minlocVal = 0., minlocVal_ = 0., maxlocVal = 0., maxlocVal_ = 0.;
-        if(minLoc_.x == -1 || minLoc_.y == -1 || maxLoc_.x == -1 || maxLoc_.y == -1) continue;
-        if(depth == 0)
+        if (minLoc_.x == -1 || minLoc_.y == -1 || maxLoc_.x == -1 || maxLoc_.y == -1) continue;
+        if (depth == 0)
         {
-            minlocVal = mat1_roi.at<unsigned char>(minLoc);
-            minlocVal_ = mat1_roi.at<unsigned char>(minLoc_);
-            maxlocVal = mat1_roi.at<unsigned char>(maxLoc);
-            maxlocVal_ = mat1_roi.at<unsigned char>(maxLoc_);
-            error0 = ::abs(mat1_roi.at<unsigned char>(minLoc_) - mat1_roi.at<unsigned char>(minLoc));
-            error1 = ::abs(mat1_roi.at<unsigned char>(maxLoc_) - mat1_roi.at<unsigned char>(maxLoc));
+            minlocVal = src1_roi.at<unsigned char>(minLoc);
+            minlocVal_ = src1_roi.at<unsigned char>(minLoc_);
+            maxlocVal = src1_roi.at<unsigned char>(maxLoc);
+            maxlocVal_ = src1_roi.at<unsigned char>(maxLoc_);
+            error0 = ::abs(src1_roi.at<unsigned char>(minLoc_) - src1_roi.at<unsigned char>(minLoc));
+            error1 = ::abs(src1_roi.at<unsigned char>(maxLoc_) - src1_roi.at<unsigned char>(maxLoc));
         }
-        if(depth == 1)
+        if (depth == 1)
         {
-            minlocVal = mat1_roi.at<signed char>(minLoc);
-            minlocVal_ = mat1_roi.at<signed char>(minLoc_);
-            maxlocVal = mat1_roi.at<signed char>(maxLoc);
-            maxlocVal_ = mat1_roi.at<signed char>(maxLoc_);
-            error0 = ::abs(mat1_roi.at<signed char>(minLoc_) - mat1_roi.at<signed char>(minLoc));
-            error1 = ::abs(mat1_roi.at<signed char>(maxLoc_) - mat1_roi.at<signed char>(maxLoc));
+            minlocVal = src1_roi.at<signed char>(minLoc);
+            minlocVal_ = src1_roi.at<signed char>(minLoc_);
+            maxlocVal = src1_roi.at<signed char>(maxLoc);
+            maxlocVal_ = src1_roi.at<signed char>(maxLoc_);
+            error0 = ::abs(src1_roi.at<signed char>(minLoc_) - src1_roi.at<signed char>(minLoc));
+            error1 = ::abs(src1_roi.at<signed char>(maxLoc_) - src1_roi.at<signed char>(maxLoc));
         }
-        if(depth == 2)
+        if (depth == 2)
         {
-            minlocVal = mat1_roi.at<unsigned short>(minLoc);
-            minlocVal_ = mat1_roi.at<unsigned short>(minLoc_);
-            maxlocVal = mat1_roi.at<unsigned short>(maxLoc);
-            maxlocVal_ = mat1_roi.at<unsigned short>(maxLoc_);
-            error0 = ::abs(mat1_roi.at<unsigned short>(minLoc_) - mat1_roi.at<unsigned short>(minLoc));
-            error1 = ::abs(mat1_roi.at<unsigned short>(maxLoc_) - mat1_roi.at<unsigned short>(maxLoc));
+            minlocVal = src1_roi.at<unsigned short>(minLoc);
+            minlocVal_ = src1_roi.at<unsigned short>(minLoc_);
+            maxlocVal = src1_roi.at<unsigned short>(maxLoc);
+            maxlocVal_ = src1_roi.at<unsigned short>(maxLoc_);
+            error0 = ::abs(src1_roi.at<unsigned short>(minLoc_) - src1_roi.at<unsigned short>(minLoc));
+            error1 = ::abs(src1_roi.at<unsigned short>(maxLoc_) - src1_roi.at<unsigned short>(maxLoc));
         }
-        if(depth == 3)
+        if (depth == 3)
         {
-            minlocVal = mat1_roi.at<signed short>(minLoc);
-            minlocVal_ = mat1_roi.at<signed short>(minLoc_);
-            maxlocVal = mat1_roi.at<signed short>(maxLoc);
-            maxlocVal_ = mat1_roi.at<signed short>(maxLoc_);
-            error0 = ::abs(mat1_roi.at<signed short>(minLoc_) - mat1_roi.at<signed short>(minLoc));
-            error1 = ::abs(mat1_roi.at<signed short>(maxLoc_) - mat1_roi.at<signed short>(maxLoc));
+            minlocVal = src1_roi.at<signed short>(minLoc);
+            minlocVal_ = src1_roi.at<signed short>(minLoc_);
+            maxlocVal = src1_roi.at<signed short>(maxLoc);
+            maxlocVal_ = src1_roi.at<signed short>(maxLoc_);
+            error0 = ::abs(src1_roi.at<signed short>(minLoc_) - src1_roi.at<signed short>(minLoc));
+            error1 = ::abs(src1_roi.at<signed short>(maxLoc_) - src1_roi.at<signed short>(maxLoc));
         }
-        if(depth == 4)
+        if (depth == 4)
         {
-            minlocVal = mat1_roi.at<int>(minLoc);
-            minlocVal_ = mat1_roi.at<int>(minLoc_);
-            maxlocVal = mat1_roi.at<int>(maxLoc);
-            maxlocVal_ = mat1_roi.at<int>(maxLoc_);
-            error0 = ::abs(mat1_roi.at<int>(minLoc_) - mat1_roi.at<int>(minLoc));
-            error1 = ::abs(mat1_roi.at<int>(maxLoc_) - mat1_roi.at<int>(maxLoc));
+            minlocVal = src1_roi.at<int>(minLoc);
+            minlocVal_ = src1_roi.at<int>(minLoc_);
+            maxlocVal = src1_roi.at<int>(maxLoc);
+            maxlocVal_ = src1_roi.at<int>(maxLoc_);
+            error0 = ::abs(src1_roi.at<int>(minLoc_) - src1_roi.at<int>(minLoc));
+            error1 = ::abs(src1_roi.at<int>(maxLoc_) - src1_roi.at<int>(maxLoc));
         }
-        if(depth == 5)
+        if (depth == 5)
         {
-            minlocVal = mat1_roi.at<float>(minLoc);
-            minlocVal_ = mat1_roi.at<float>(minLoc_);
-            maxlocVal = mat1_roi.at<float>(maxLoc);
-            maxlocVal_ = mat1_roi.at<float>(maxLoc_);
-            error0 = ::abs(mat1_roi.at<float>(minLoc_) - mat1_roi.at<float>(minLoc));
-            error1 = ::abs(mat1_roi.at<float>(maxLoc_) - mat1_roi.at<float>(maxLoc));
+            minlocVal = src1_roi.at<float>(minLoc);
+            minlocVal_ = src1_roi.at<float>(minLoc_);
+            maxlocVal = src1_roi.at<float>(maxLoc);
+            maxlocVal_ = src1_roi.at<float>(maxLoc_);
+            error0 = ::abs(src1_roi.at<float>(minLoc_) - src1_roi.at<float>(minLoc));
+            error1 = ::abs(src1_roi.at<float>(maxLoc_) - src1_roi.at<float>(maxLoc));
         }
-        if(depth == 6)
+        if (depth == 6)
         {
-            minlocVal = mat1_roi.at<double>(minLoc);
-            minlocVal_ = mat1_roi.at<double>(minLoc_);
-            maxlocVal = mat1_roi.at<double>(maxLoc);
-            maxlocVal_ = mat1_roi.at<double>(maxLoc_);
-            error0 = ::abs(mat1_roi.at<double>(minLoc_) - mat1_roi.at<double>(minLoc));
-            error1 = ::abs(mat1_roi.at<double>(maxLoc_) - mat1_roi.at<double>(maxLoc));
+            minlocVal = src1_roi.at<double>(minLoc);
+            minlocVal_ = src1_roi.at<double>(minLoc_);
+            maxlocVal = src1_roi.at<double>(maxLoc);
+            maxlocVal_ = src1_roi.at<double>(maxLoc_);
+            error0 = ::abs(src1_roi.at<double>(minLoc_) - src1_roi.at<double>(minLoc));
+            error1 = ::abs(src1_roi.at<double>(maxLoc_) - src1_roi.at<double>(maxLoc));
         }
 
         EXPECT_DOUBLE_EQ(minVal_, minVal);
@@ -873,16 +1018,18 @@ TEST_P(MinMaxLoc, MASK)
     }
 }
 
+//////////////////////////////// Sum /////////////////////////////////////////////////
 
-struct Sum : ArithmTestBase {};
+typedef ArithmTestBase Sum;
 
-TEST_P(Sum, MAT)
+TEST_P(Sum, DISABLED_MAT)
 {
-    for(int j = 0; j < LOOP_TIMES; j++)
+    for (int j = 0; j < LOOP_TIMES; j++)
     {
         random_roi();
-        Scalar cpures = cv::sum(mat1_roi);
-        Scalar gpures = cv::ocl::sum(gmat1);
+
+        Scalar cpures = cv::sum(src1_roi);
+        Scalar gpures = cv::ocl::sum(gsrc1);
 
         //check results
         EXPECT_NEAR(cpures[0], gpures[0], 0.1);
@@ -892,424 +1039,294 @@ TEST_P(Sum, MAT)
     }
 }
 
+//////////////////////////////// CountNonZero /////////////////////////////////////////////////
 
-struct CountNonZero : ArithmTestBase {};
+typedef ArithmTestBase CountNonZero;
 
 TEST_P(CountNonZero, MAT)
 {
-    for(int j = 0; j < LOOP_TIMES; j++)
+    for (int j = 0; j < LOOP_TIMES; j++)
     {
         random_roi();
-        int cpures = cv::countNonZero(mat1_roi);
-        int gpures = cv::ocl::countNonZero(gmat1);
+        int cpures = cv::countNonZero(src1_roi);
+        int gpures = cv::ocl::countNonZero(gsrc1);
 
         EXPECT_DOUBLE_EQ((double)cpures, (double)gpures);
     }
 }
 
+//////////////////////////////// Phase /////////////////////////////////////////////////
 
+typedef ArithmTestBase Phase;
 
-////////////////////////////////phase/////////////////////////////////////////////////
-struct Phase : ArithmTestBase {};
-
-TEST_P(Phase, Mat)
+TEST_P(Phase, DISABLED_Mat) // the DISABLED_ name prefix makes googletest skip this test unless disabled tests are explicitly requested
 {
-    if(mat1.depth() != CV_32F && mat1.depth() != CV_64F)
+    for (int angelInDegrees = 0; angelInDegrees < 2; angelInDegrees++) // pass 0 calls phase() with false as last argument, pass 1 with true
     {
-        cout << "\tUnsupported type\t\n";
-    }
-    for(int angelInDegrees = 0; angelInDegrees < 2; angelInDegrees++)
-    {
-        for(int j = 0; j < LOOP_TIMES; j++)
+        for (int j = 0; j < LOOP_TIMES; j++)
         {
             random_roi();
-            cv::phase(mat1_roi, mat2_roi, dst_roi, angelInDegrees ? true : false);
-            cv::ocl::phase(gmat1, gmat2, gdst, angelInDegrees ? true : false);
+            cv::phase(src1_roi, src2_roi, dst1_roi, angelInDegrees ? true : false);
+            cv::ocl::phase(gsrc1, gsrc2, gdst1, angelInDegrees ? true : false);
             Near(1e-2);
         }
     }
 }
 
+//////////////////////////////// Bitwise_and /////////////////////////////////////////////////
 
-////////////////////////////////bitwise_and/////////////////////////////////////////////////
-struct Bitwise_and : ArithmTestBase {};
+typedef ArithmTestBase Bitwise_and;
 
 TEST_P(Bitwise_and, Mat)
 {
-    for(int j = 0; j < LOOP_TIMES; j++)
+    for (int j = 0; j < LOOP_TIMES; j++)
     {
         random_roi();
 
-        cv::bitwise_and(mat1_roi, mat2_roi, dst_roi);
-        cv::ocl::bitwise_and(gmat1, gmat2, gdst);
+        cv::bitwise_and(src1_roi, src2_roi, dst1_roi);
+        cv::ocl::bitwise_and(gsrc1, gsrc2, gdst1);
         Near(0);
     }
 }
 
 TEST_P(Bitwise_and, Mat_Mask)
 {
-    for(int j = 0; j < LOOP_TIMES; j++)
+    for (int j = 0; j < LOOP_TIMES; j++)
     {
         random_roi();
 
-        cv::bitwise_and(mat1_roi, mat2_roi, dst_roi, mask_roi);
-        cv::ocl::bitwise_and(gmat1, gmat2, gdst, gmask);
+        cv::bitwise_and(src1_roi, src2_roi, dst1_roi, mask_roi);
+        cv::ocl::bitwise_and(gsrc1, gsrc2, gdst1, gmask);
         Near(0);
     }
 }
 
 TEST_P(Bitwise_and, Scalar)
 {
-    for(int j = 0; j < LOOP_TIMES; j++)
+    for (int j = 0; j < LOOP_TIMES; j++)
     {
         random_roi();
 
-        cv::bitwise_and(mat1_roi, val, dst_roi);
-        cv::ocl::bitwise_and(gmat1, val, gdst);
+        cv::bitwise_and(src1_roi, val, dst1_roi);
+        cv::ocl::bitwise_and(gsrc1, val, gdst1);
         Near(1e-5);
     }
 }
 
 TEST_P(Bitwise_and, Scalar_Mask)
 {
-    for(int j = 0; j < LOOP_TIMES; j++)
+    for (int j = 0; j < LOOP_TIMES; j++)
     {
         random_roi();
 
-        cv::bitwise_and(mat1_roi, val, dst_roi, mask_roi);
-        cv::ocl::bitwise_and(gmat1, val, gdst, gmask);
+        cv::bitwise_and(src1_roi, val, dst1_roi, mask_roi);
+        cv::ocl::bitwise_and(gsrc1, val, gdst1, gmask);
         Near(1e-5);
     }
 }
 
+//////////////////////////////// Bitwise_or /////////////////////////////////////////////////
 
-
-////////////////////////////////bitwise_or/////////////////////////////////////////////////
-
-struct Bitwise_or : ArithmTestBase {};
+typedef ArithmTestBase Bitwise_or;
 
 TEST_P(Bitwise_or, Mat)
 {
-    for(int j = 0; j < LOOP_TIMES; j++)
+    for (int j = 0; j < LOOP_TIMES; j++)
     {
         random_roi();
 
-        cv::bitwise_or(mat1_roi, mat2_roi, dst_roi);
-        cv::ocl::bitwise_or(gmat1, gmat2, gdst);
+        cv::bitwise_or(src1_roi, src2_roi, dst1_roi);
+        cv::ocl::bitwise_or(gsrc1, gsrc2, gdst1);
         Near(0);
     }
 }
 
 TEST_P(Bitwise_or, Mat_Mask)
 {
-    for(int j = 0; j < LOOP_TIMES; j++)
+    for (int j = 0; j < LOOP_TIMES; j++)
     {
         random_roi();
 
-        cv::bitwise_or(mat1_roi, mat2_roi, dst_roi, mask_roi);
-        cv::ocl::bitwise_or(gmat1, gmat2, gdst, gmask);
+        cv::bitwise_or(src1_roi, src2_roi, dst1_roi, mask_roi);
+        cv::ocl::bitwise_or(gsrc1, gsrc2, gdst1, gmask);
         Near(0);
     }
 }
 
 TEST_P(Bitwise_or, Scalar)
 {
-    for(int j = 0; j < LOOP_TIMES; j++)
+    for (int j = 0; j < LOOP_TIMES; j++)
     {
         random_roi();
 
-        cv::bitwise_or(mat1_roi, val, dst_roi);
-        cv::ocl::bitwise_or(gmat1, val, gdst);
+        cv::bitwise_or(src1_roi, val, dst1_roi);
+        cv::ocl::bitwise_or(gsrc1, val, gdst1);
         Near(1e-5);
     }
 }
 
 TEST_P(Bitwise_or, Scalar_Mask)
 {
-    for(int j = 0; j < LOOP_TIMES; j++)
+    for (int j = 0; j < LOOP_TIMES; j++)
     {
         random_roi();
 
-        cv::bitwise_or(mat1_roi, val, dst_roi, mask_roi);
-        cv::ocl::bitwise_or(gmat1, val, gdst, gmask);
+        cv::bitwise_or(src1_roi, val, dst1_roi, mask_roi);
+        cv::ocl::bitwise_or(gsrc1, val, gdst1, gmask);
         Near(1e-5);
     }
 }
 
+//////////////////////////////// Bitwise_xor /////////////////////////////////////////////////
 
-
-////////////////////////////////bitwise_xor/////////////////////////////////////////////////
-
-struct Bitwise_xor : ArithmTestBase {};
+typedef ArithmTestBase Bitwise_xor;
 
 TEST_P(Bitwise_xor, Mat)
 {
-    for(int j = 0; j < LOOP_TIMES; j++)
+    for (int j = 0; j < LOOP_TIMES; j++)
     {
         random_roi();
 
-        cv::bitwise_xor(mat1_roi, mat2_roi, dst_roi);
-        cv::ocl::bitwise_xor(gmat1, gmat2, gdst);
+        cv::bitwise_xor(src1_roi, src2_roi, dst1_roi);
+        cv::ocl::bitwise_xor(gsrc1, gsrc2, gdst1);
         Near(0);
     }
 }
 
 TEST_P(Bitwise_xor, Mat_Mask)
 {
-    for(int j = 0; j < LOOP_TIMES; j++)
+    for (int j = 0; j < LOOP_TIMES; j++)
     {
         random_roi();
 
-        cv::bitwise_xor(mat1_roi, mat2_roi, dst_roi, mask_roi);
-        cv::ocl::bitwise_xor(gmat1, gmat2, gdst, gmask);
+        cv::bitwise_xor(src1_roi, src2_roi, dst1_roi, mask_roi);
+        cv::ocl::bitwise_xor(gsrc1, gsrc2, gdst1, gmask);
         Near(0);
     }
 }
 
 TEST_P(Bitwise_xor, Scalar)
 {
-    for(int j = 0; j < LOOP_TIMES; j++)
+    for (int j = 0; j < LOOP_TIMES; j++)
     {
         random_roi();
 
-        cv::bitwise_xor(mat1_roi, val, dst_roi);
-        cv::ocl::bitwise_xor(gmat1, val, gdst);
+        cv::bitwise_xor(src1_roi, val, dst1_roi);
+        cv::ocl::bitwise_xor(gsrc1, val, gdst1);
         Near(1e-5);
     }
 }
 
 TEST_P(Bitwise_xor, Scalar_Mask)
 {
-    for(int j = 0; j < LOOP_TIMES; j++)
+    for (int j = 0; j < LOOP_TIMES; j++)
     {
         random_roi();
 
-        cv::bitwise_xor(mat1_roi, val, dst_roi, mask_roi);
-        cv::ocl::bitwise_xor(gmat1, val, gdst, gmask);
+        cv::bitwise_xor(src1_roi, val, dst1_roi, mask_roi);
+        cv::ocl::bitwise_xor(gsrc1, val, gdst1, gmask);
         Near(1e-5);
     }
 }
 
+//////////////////////////////// Bitwise_not /////////////////////////////////////////////////
 
-////////////////////////////////bitwise_not/////////////////////////////////////////////////
-
-struct Bitwise_not : ArithmTestBase {};
+typedef ArithmTestBase Bitwise_not;
 
 TEST_P(Bitwise_not, Mat)
 {
-    for(int j = 0; j < LOOP_TIMES; j++)
+    for (int j = 0; j < LOOP_TIMES; j++)
     {
         random_roi();
 
-        cv::bitwise_not(mat1_roi, dst_roi);
-        cv::ocl::bitwise_not(gmat1, gdst);
+        cv::bitwise_not(src1_roi, dst1_roi);
+        cv::ocl::bitwise_not(gsrc1, gdst1);
         Near(0);
     }
 }
 
+//////////////////////////////// Compare /////////////////////////////////////////////////
 
-////////////////////////////////compare/////////////////////////////////////////////////
-struct Compare : ArithmTestBase {};
+typedef ArithmTestBase Compare;
 
 TEST_P(Compare, Mat)
 {
-    if(mat1.type() == CV_8SC1)
-        //if(mat1.type() != CV_8UC1 || mat1.type()!= CV_16UC1 || mat1.type()!= CV_16SC1|| mat1.type()!= CV_32SC1 || mat1.type()!= CV_32FC1|| mat1.type()!= CV_64FC1)
-    {
-        cout << "\tUnsupported type\t\n";
-    }
-
-    int cmp_codes[] = {CMP_EQ, CMP_GT, CMP_GE, CMP_LT, CMP_LE, CMP_NE};
-    //const char *cmp_str[] = {"CMP_EQ", "CMP_GT", "CMP_GE", "CMP_LT", "CMP_LE", "CMP_NE"};
+    int cmp_codes[] = { CMP_EQ, CMP_GT, CMP_GE, CMP_LT, CMP_LE, CMP_NE };
     int cmp_num = sizeof(cmp_codes) / sizeof(int);
 
     for (int i = 0; i < cmp_num; ++i)
-    {
-
-        for(int j = 0; j < LOOP_TIMES; j++)
+        for (int j = 0; j < LOOP_TIMES; j++)
         {
             random_roi();
 
-            cv::compare(mat1_roi, mat2_roi, dst_roi, cmp_codes[i]);
-            cv::ocl::compare(gmat1, gmat2, gdst, cmp_codes[i]);
+            cv::compare(src1_roi, src2_roi, dst1_roi, cmp_codes[i]);
+            cv::ocl::compare(gsrc1, gsrc2, gdst1, cmp_codes[i]);
+
             Near(0);
         }
-    }
-
 }
 
+//////////////////////////////// Pow /////////////////////////////////////////////////
 
-struct Pow : ArithmTestBase {};
+typedef ArithmTestBase Pow;
 
 TEST_P(Pow, Mat)
 {
-    if(mat1.depth() != CV_32F && mat1.depth() != CV_64F)
-    {
-        cout << "\tUnsupported type\t\n";
-    }
-
-    for(int j = 0; j < LOOP_TIMES; j++)
+    for (int j = 0; j < LOOP_TIMES; j++)
     {
         random_roi();
         double p = 4.5;
-        cv::pow(mat1_roi, p, dst_roi);
-        cv::ocl::pow(gmat1, p, gdst);
+        cv::pow(src1_roi, p, dst1_roi);
+        cv::ocl::pow(gsrc1, p, gdst1);
         Near(1);
     }
 }
 
+//////////////////////////////// AddWeighted /////////////////////////////////////////////////
 
-struct MagnitudeSqr : ArithmTestBase {};
-
-TEST_P(MagnitudeSqr, Mat)
-{
-    for(int j = 0; j < LOOP_TIMES; j++)
-    {
-        random_roi();
-        for(int i = 0; i < mat1.rows; ++i)
-            for(int j = 0; j < mat1.cols; ++j)
-            {
-                float val1 = mat1.at<float>(i, j);
-                float val2 = mat2.at<float>(i, j);
-                ((float *)(dst.data))[i * dst.step / 4 + j] = val1 * val1 + val2 * val2;
-            }
-
-        cv::ocl::oclMat clmat1(mat1), clmat2(mat2);
-        cv::ocl::magnitudeSqr(clmat1, clmat2, gdst);
-        Near(1);
-    }
-}
-
-
-struct AddWeighted : ArithmTestBase {};
+typedef ArithmTestBase AddWeighted;
 
 TEST_P(AddWeighted, Mat)
 {
-    for(int j = 0; j < LOOP_TIMES; j++)
+    for (int j = 0; j < LOOP_TIMES; j++)
     {
         random_roi();
-        double alpha = 2.0, beta = 1.0, gama = 3.0;
 
+        const double alpha = 2.0, beta = 1.0, gama = 3.0;
 
-        cv::addWeighted(mat1_roi, alpha, mat2_roi, beta, gama, dst_roi);
-
-        //	cv::ocl::oclMat clmat1(mat1),clmat2(mat2),cldst;
-
-        cv::ocl::addWeighted(gmat1, alpha, gmat2, beta, gama, gdst);
+        cv::addWeighted(src1_roi, alpha, src2_roi, beta, gama, dst1_roi);
+        cv::ocl::addWeighted(gsrc1, alpha, gsrc2, beta, gama, gdst1);
 
         Near(1e-5);
     }
 }
 
+//////////////////////////////////////// Instantiation /////////////////////////////////////////
 
-
-
-//********test****************
-
-INSTANTIATE_TEST_CASE_P(Arithm, Lut, Combine(
-                            Values(CV_8UC1, CV_8UC3, CV_8UC4),
-                            Values(false))); // Values(false) is the reserved parameter
-
-INSTANTIATE_TEST_CASE_P(Arithm, Exp, Combine(
-                            Values(CV_32FC1, CV_32FC1),
-                            Values(false))); // Values(false) is the reserved parameter
-
-INSTANTIATE_TEST_CASE_P(Arithm, Log, Combine(
-                            Values(CV_32FC1, CV_32FC1),
-                            Values(false))); // Values(false) is the reserved parameter
-
-INSTANTIATE_TEST_CASE_P(Arithm, Add, Combine(
-                            Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32SC1, CV_32SC3, CV_32SC4, CV_32FC1, CV_32FC3, CV_32FC4),
-                            Values(false)));
-
-INSTANTIATE_TEST_CASE_P(Arithm, Sub, Combine(
-                            Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32SC1, CV_32SC3, CV_32SC4, CV_32FC1, CV_32FC3, CV_32FC4),
-                            Values(false)));
-
-INSTANTIATE_TEST_CASE_P(Arithm, Mul, Combine(
-                            Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32SC1, CV_32SC3, CV_32SC4, CV_32FC1, CV_32FC3, CV_32FC4),
-                            Values(false))); // Values(false) is the reserved parameter
-
-INSTANTIATE_TEST_CASE_P(Arithm, Div, Combine(
-                            Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32SC1, CV_32SC3, CV_32SC4, CV_32FC1, CV_32FC3, CV_32FC4),
-                            Values(false))); // Values(false) is the reserved parameter
-
-
-INSTANTIATE_TEST_CASE_P(Arithm, Absdiff, Combine(
-                            Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32SC1, CV_32SC3, CV_32SC4, CV_32FC1, CV_32FC3, CV_32FC4),
-                            Values(false))); // Values(false) is the reserved parameter
-
-INSTANTIATE_TEST_CASE_P(Arithm, CartToPolar, Combine(
-                            Values(CV_32FC1, CV_32FC3, CV_32FC4),
-                            Values(false))); // Values(false) is the reserved parameter
-
-INSTANTIATE_TEST_CASE_P(Arithm, PolarToCart, Combine(
-                            Values(CV_32FC1, CV_32FC3, CV_32FC4),
-                            Values(false))); // Values(false) is the reserved parameter
-
-INSTANTIATE_TEST_CASE_P(Arithm, Magnitude, Combine(
-                            Values(CV_32FC1, CV_32FC3, CV_32FC4),
-                            Values(false))); // Values(false) is the reserved parameter
-
-INSTANTIATE_TEST_CASE_P(Arithm, Transpose, Combine(
-                            Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32SC1, CV_32FC1),
-                            Values(false))); // Values(false) is the reserved parameter
-
-INSTANTIATE_TEST_CASE_P(Arithm, Flip, Combine(
-                            Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32SC1, CV_32SC3, CV_32SC4, CV_32FC1, CV_32FC3, CV_32FC4),
-                            Values(false))); // Values(false) is the reserved parameter
-
-INSTANTIATE_TEST_CASE_P(Arithm, MinMax, Combine(
-                            Values(CV_8UC1, CV_32SC1, CV_32FC1),
-                            Values(false)));
-
-INSTANTIATE_TEST_CASE_P(Arithm, MinMaxLoc, Combine(
-                            Values(CV_8UC1, CV_32SC1, CV_32FC1),
-                            Values(false)));
-
-INSTANTIATE_TEST_CASE_P(Arithm, Sum, Combine(
-                            Values(CV_8U, CV_32S, CV_32F),
-                            Values(false)));
-
-INSTANTIATE_TEST_CASE_P(Arithm, CountNonZero, Combine(
-                            Values(CV_8U, CV_32S, CV_32F),
-                            Values(false)));
-
-
-INSTANTIATE_TEST_CASE_P(Arithm, Phase, Combine(Values(CV_32FC1, CV_32FC3, CV_32FC4), Values(false)));
-// Values(false) is the reserved parameter
-
-
-INSTANTIATE_TEST_CASE_P(Arithm, Bitwise_and, Combine(
-                            Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32SC1, CV_32SC3, CV_32SC4, CV_32FC1, CV_32FC3, CV_32FC4), Values(false)));
-//Values(false) is the reserved parameter
-
-INSTANTIATE_TEST_CASE_P(Arithm, Bitwise_or, Combine(
-                            Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32SC1, CV_32FC1, CV_32FC3, CV_32FC4), Values(false)));
-//Values(false) is the reserved parameter
-
-INSTANTIATE_TEST_CASE_P(Arithm, Bitwise_xor, Combine(
-                            Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32SC1, CV_32FC1, CV_32FC3, CV_32FC4), Values(false)));
-//Values(false) is the reserved parameter
-
-INSTANTIATE_TEST_CASE_P(Arithm, Bitwise_not, Combine(
-                            Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32SC1, CV_32FC1, CV_32FC3, CV_32FC4), Values(false)));
-//Values(false) is the reserved parameter
-
-INSTANTIATE_TEST_CASE_P(Arithm, Compare, Combine(Values(CV_8UC1, CV_32SC1, CV_32FC1), Values(false)));
-// Values(false) is the reserved parameter
-
-INSTANTIATE_TEST_CASE_P(Arithm, Pow, Combine(Values(CV_32FC1, CV_32FC3, CV_32FC4), Values(false)));
-// Values(false) is the reserved parameter
-
-INSTANTIATE_TEST_CASE_P(Arithm, MagnitudeSqr, Combine(
-                            Values(CV_32FC1, CV_32FC1),
-                            Values(false))); // Values(false) is the reserved parameter
-
-INSTANTIATE_TEST_CASE_P(Arithm, AddWeighted, Combine(
-                            Values(CV_8UC1, CV_32SC1, CV_32FC1),
-                            Values(false))); // Values(false) is the reserved parameter
-
-
+INSTANTIATE_TEST_CASE_P(Arithm, Lut, Combine(testing::Range(CV_8U, CV_USRTYPE1), testing::Range(1, 5), Bool(), Bool())); // +
+INSTANTIATE_TEST_CASE_P(Arithm, Exp, Combine(testing::Values(CV_32F, CV_64F), testing::Range(1, 5), Bool())); // +
+INSTANTIATE_TEST_CASE_P(Arithm, Log, Combine(testing::Values(CV_32F, CV_64F), testing::Range(1, 5), Bool())); // +
+INSTANTIATE_TEST_CASE_P(Arithm, Add, Combine(testing::Range(CV_8U, CV_USRTYPE1), testing::Range(1, 5), Bool())); // +
+INSTANTIATE_TEST_CASE_P(Arithm, Sub, Combine(testing::Range(CV_8U, CV_USRTYPE1), testing::Range(1, 5), Bool())); // +
+INSTANTIATE_TEST_CASE_P(Arithm, Mul, Combine(testing::Range(CV_8U, CV_USRTYPE1), testing::Range(1, 5), Bool()));
+INSTANTIATE_TEST_CASE_P(Arithm, Div, Combine(testing::Range(CV_8U, CV_USRTYPE1), testing::Range(1, 5), Bool()));
+INSTANTIATE_TEST_CASE_P(Arithm, Absdiff, Combine(testing::Range(CV_8U, CV_USRTYPE1), testing::Range(1, 5), Bool())); // +
+INSTANTIATE_TEST_CASE_P(Arithm, CartToPolar, Combine(Values(CV_32F, CV_64F), testing::Range(1, 5), Bool())); // +
+INSTANTIATE_TEST_CASE_P(Arithm, PolarToCart, Combine(Values(CV_32F, CV_64F), testing::Range(1, 5), Bool())); // +
+INSTANTIATE_TEST_CASE_P(Arithm, Magnitude, Combine(Values(CV_32F, CV_64F), testing::Range(1, 5), Bool())); // +
+INSTANTIATE_TEST_CASE_P(Arithm, Transpose, Combine(testing::Range(CV_8U, CV_USRTYPE1), testing::Range(1, 5), Bool())); // +
+INSTANTIATE_TEST_CASE_P(Arithm, Flip, Combine(testing::Range(CV_8U, CV_USRTYPE1), testing::Range(1, 5), Bool())); // +
+INSTANTIATE_TEST_CASE_P(Arithm, MinMax, Combine(testing::Range(CV_8U, CV_USRTYPE1), Values(1), Bool()));
+INSTANTIATE_TEST_CASE_P(Arithm, MinMaxLoc, Combine(testing::Range(CV_8U, CV_USRTYPE1), Values(1), Bool())); // +
+INSTANTIATE_TEST_CASE_P(Arithm, Sum, Combine(testing::Range(CV_8U, CV_USRTYPE1), testing::Range(1, 5), Bool()));
+INSTANTIATE_TEST_CASE_P(Arithm, CountNonZero, Combine(testing::Range(CV_8U, CV_USRTYPE1), Values(1), Bool())); // +
+INSTANTIATE_TEST_CASE_P(Arithm, Phase, Combine(Values(CV_32F, CV_64F), testing::Range(1, 5), Bool())); // +
+INSTANTIATE_TEST_CASE_P(Arithm, Bitwise_and, Combine(testing::Range(CV_8U, CV_USRTYPE1), testing::Range(1, 5), Bool())); // +
+INSTANTIATE_TEST_CASE_P(Arithm, Bitwise_or, Combine(testing::Range(CV_8U, CV_USRTYPE1), testing::Range(1, 5), Bool())); // +
+INSTANTIATE_TEST_CASE_P(Arithm, Bitwise_xor, Combine(testing::Range(CV_8U, CV_USRTYPE1), testing::Range(1, 5), Bool())); // +
+INSTANTIATE_TEST_CASE_P(Arithm, Bitwise_not, Combine(testing::Range(CV_8U, CV_USRTYPE1), testing::Range(1, 5), Bool())); // +
+INSTANTIATE_TEST_CASE_P(Arithm, Compare, Combine(testing::Range(CV_8U, CV_USRTYPE1), Values(1), Bool())); // +
+INSTANTIATE_TEST_CASE_P(Arithm, Pow, Combine(Values(CV_32F, CV_64F), testing::Range(1, 5), Bool())); // +
+INSTANTIATE_TEST_CASE_P(Arithm, AddWeighted, Combine(testing::Range(CV_8U, CV_USRTYPE1), testing::Range(1, 5), Bool())); // +
 
 #endif // HAVE_OPENCL
diff --git a/modules/ocl/test/test_bgfg.cpp b/modules/ocl/test/test_bgfg.cpp
index d7492d600..4c495f5e0 100644
--- a/modules/ocl/test/test_bgfg.cpp
+++ b/modules/ocl/test/test_bgfg.cpp
@@ -53,7 +53,20 @@ using namespace cvtest;
 using namespace testing;
 using namespace std;
 
-extern string workdir;
+#if defined(HAVE_XINE)         || \
+    defined(HAVE_GSTREAMER)    || \
+    defined(HAVE_QUICKTIME)    || \
+    defined(HAVE_AVFOUNDATION) || \
+    defined(HAVE_FFMPEG)       || \
+    defined(WIN32)
+// BUILD_WITH_VIDEO_INPUT_SUPPORT is 1 when any of the macros above is defined and 0 otherwise; it gates the rest of this file.
+#  define BUILD_WITH_VIDEO_INPUT_SUPPORT 1
+#else
+#  define BUILD_WITH_VIDEO_INPUT_SUPPORT 0
+#endif
+
+#if BUILD_WITH_VIDEO_INPUT_SUPPORT
+
 //////////////////////////////////////////////////////
 // MOG
 
@@ -225,3 +238,5 @@ INSTANTIATE_TEST_CASE_P(OCL_Video, mog2, testing::Combine(
     Values(true, false)));
 
 #endif
+
+#endif
diff --git a/modules/ocl/test/test_imgproc.cpp b/modules/ocl/test/test_imgproc.cpp
index 4cad2fabe..3840dcc14 100644
--- a/modules/ocl/test/test_imgproc.cpp
+++ b/modules/ocl/test/test_imgproc.cpp
@@ -482,7 +482,6 @@ struct CopyMakeBorder : ImgprocTestBase {};
 TEST_P(CopyMakeBorder, Mat)
 {
     int bordertype[] = {cv::BORDER_CONSTANT, cv::BORDER_REPLICATE, cv::BORDER_REFLECT, cv::BORDER_WRAP, cv::BORDER_REFLECT_101};
-    //const char *borderstr[] = {"BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", "BORDER_WRAP", "BORDER_REFLECT_101"};
     cv::RNG &rng = TS::ptr()->get_rng();
     int top = rng.uniform(0, 10);
     int bottom = rng.uniform(0, 10);
@@ -895,8 +894,7 @@ TEST_P(Remap, Mat)
         return;
     }
     int bordertype[] = {cv::BORDER_CONSTANT, cv::BORDER_REPLICATE/*,BORDER_REFLECT,BORDER_WRAP,BORDER_REFLECT_101*/};
-    //const char *borderstr[] = {"BORDER_CONSTANT", "BORDER_REPLICATE"/*, "BORDER_REFLECT","BORDER_WRAP","BORDER_REFLECT_101"*/};
-    // for(int i = 0; i < sizeof(bordertype)/sizeof(int); i++)
+
     for(int j = 0; j < LOOP_TIMES; j++)
     {
         random_roi();
@@ -908,7 +906,6 @@ TEST_P(Remap, Mat)
         if(interpolation == 0)
             EXPECT_MAT_NEAR(dst, cpu_dst, 1.0);
         EXPECT_MAT_NEAR(dst, cpu_dst, 2.0);
-
     }
 }
 
diff --git a/modules/ocl/test/test_kalman.cpp b/modules/ocl/test/test_kalman.cpp
new file mode 100644
index 000000000..13f9d0b81
--- /dev/null
+++ b/modules/ocl/test/test_kalman.cpp
@@ -0,0 +1,146 @@
+///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Jin Ma, jin@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "test_precomp.hpp"
+#ifdef HAVE_OPENCL
+using namespace cv;
+using namespace cv::ocl;
+using namespace cvtest;
+using namespace testing;
+using namespace std;
+
+//////////////////////////////////////////////////////////////////////////
+PARAM_TEST_CASE(Kalman, int, int) // parameterized fixture; the two int parameters are read in SetUp() below
+{
+    int size_;     // parameter 0; TEST_P(Kalman, Accuracy) uses it as the filter dimension
+    int iteration; // parameter 1; TEST_P(Kalman, Accuracy) uses it as the number of loop steps
+    virtual void SetUp()
+    {
+        size_ = GET_PARAM(0);
+        iteration = GET_PARAM(1);
+    }
+};
+
+TEST_P(Kalman, Accuracy) // drives cv::ocl::KalmanFilter and cv::KalmanFilter with the same inputs and compares their statePost
+{
+    const int Dim = size_;          // passed as both arguments of init() for the two filters
+    const int Steps = iteration;    // number of predict/correct rounds in the loop below
+    const double max_init = 1;      // the initial sample is generated with -max_init / max_init as randomMat arguments
+    const double max_noise = 0.1;   // factor applied to the random perturbation inside the loop
+
+    cv::RNG &rng = TS::ptr()->get_rng();
+
+    Mat sample_mat(Dim, 1, CV_32F), temp_mat; // NOTE(review): temp_mat is never used in this test
+    oclMat Sample(Dim, 1, CV_32F);
+    oclMat Temp(Dim, 1, CV_32F);
+    Mat Temp_cpu(Dim, 1, CV_32F);
+
+    Size size(Sample.cols, Sample.rows);
+
+    sample_mat =  randomMat(rng, size, Sample.type(), -max_init, max_init, false);
+    Sample.upload(sample_mat); // device copy of the same initial sample
+
+    // OpenCL filter: identity errorCovPre / measurementMatrix / errorCovPost, zeroed measurementNoiseCov / statePre / statePost, then one correct()
+    cv::ocl::KalmanFilter kalman_filter_ocl;
+    kalman_filter_ocl.init(Dim, Dim);
+
+    cv::ocl::setIdentity(kalman_filter_ocl.errorCovPre, 1);
+    cv::ocl::setIdentity(kalman_filter_ocl.measurementMatrix, 1);
+    cv::ocl::setIdentity(kalman_filter_ocl.errorCovPost, 1);
+
+    kalman_filter_ocl.measurementNoiseCov.setTo(Scalar::all(0));
+    kalman_filter_ocl.statePre.setTo(Scalar::all(0));
+    kalman_filter_ocl.statePost.setTo(Scalar::all(0));
+
+    kalman_filter_ocl.correct(Sample);
+    // end of OpenCL filter setup
+
+    // CPU reference filter: configured with the same sequence of calls as the OpenCL filter above
+    cv::KalmanFilter kalman_filter_cpu;
+
+    kalman_filter_cpu.init(Dim, Dim);
+
+    cv::setIdentity(kalman_filter_cpu.errorCovPre, 1);
+    cv::setIdentity(kalman_filter_cpu.measurementMatrix, 1);
+    cv::setIdentity(kalman_filter_cpu.errorCovPost, 1);
+
+    kalman_filter_cpu.measurementNoiseCov.setTo(Scalar::all(0));
+    kalman_filter_cpu.statePre.setTo(Scalar::all(0));
+    kalman_filter_cpu.statePost.setTo(Scalar::all(0));
+
+    kalman_filter_cpu.correct(sample_mat);
+    // end of CPU filter setup
+    // main loop: both filters are advanced in lock-step on identical measurements
+    for(int i = 0; i<Steps; i++)
+    {
+        kalman_filter_ocl.predict();
+        kalman_filter_cpu.predict();
+        // Temp_cpu = 1 * transitionMatrix * sample_mat (the additive term is an empty Mat with weight 0)
+        cv::gemm(kalman_filter_cpu.transitionMatrix, sample_mat, 1, cv::Mat(), 0, Temp_cpu);
+
+        Size size1(Temp.cols, Temp.rows);
+        Mat temp = randomMat(rng, size1, Temp.type(), 0, 0xffff, false);
+        // the next three calls compute temp = max_noise * (2 * temp - 1) in place
+        // NOTE(review): if 0 / 0xffff are value bounds, the perturbation is not limited to +-max_noise - confirm this is intended
+        cv::multiply(2, temp, temp);
+
+        cv::subtract(temp, 1, temp);
+
+        cv::multiply(max_noise, temp, temp);
+
+        cv::add(temp, Temp_cpu, Temp_cpu);
+        // Temp_cpu now holds the new measurement; mirror it to the device so both filters see the same data
+        Temp.upload(Temp_cpu);
+        Temp.copyTo(Sample);
+        Temp_cpu.copyTo(sample_mat); // sample_mat is the gemm input of the next iteration
+
+        kalman_filter_ocl.correct(Temp);
+        kalman_filter_cpu.correct(Temp_cpu);
+    }
+    // end of main loop
+    EXPECT_MAT_NEAR(kalman_filter_cpu.statePost, kalman_filter_ocl.statePost, 0); // NOTE(review): last argument is presumably the tolerance, i.e. an exact match is required - confirm
+}
+INSTANTIATE_TEST_CASE_P(OCL_Video, Kalman, Combine(Values(3, 7), Values(30)));
+
+#endif // HAVE_OPENCL
\ No newline at end of file
diff --git a/modules/ocl/test/test_match_template.cpp b/modules/ocl/test/test_match_template.cpp
index 551c9ff12..651d34b81 100644
--- a/modules/ocl/test/test_match_template.cpp
+++ b/modules/ocl/test/test_match_template.cpp
@@ -52,8 +52,6 @@
 
 IMPLEMENT_PARAM_CLASS(TemplateSize, cv::Size);
 
-const char *TEMPLATE_METHOD_NAMES[6] = {"TM_SQDIFF", "TM_SQDIFF_NORMED", "TM_CCORR", "TM_CCORR_NORMED", "TM_CCOEFF", "TM_CCOEFF_NORMED"};
-
 #define MTEMP_SIZES testing::Values(cv::Size(128, 256), cv::Size(1024, 768))
 
 PARAM_TEST_CASE(MatchTemplate8U, cv::Size, TemplateSize, Channels, TemplateMethod)
@@ -74,12 +72,6 @@ PARAM_TEST_CASE(MatchTemplate8U, cv::Size, TemplateSize, Channels, TemplateMetho
 
 TEST_P(MatchTemplate8U, Accuracy)
 {
-
-    std::cout << "Method: " << TEMPLATE_METHOD_NAMES[method] << std::endl;
-    std::cout << "Image Size: (" << size.width << ", " << size.height << ")" << std::endl;
-    std::cout << "Template Size: (" << templ_size.width << ", " << templ_size.height << ")" << std::endl;
-    std::cout << "Channels: " << cn << std::endl;
-
     cv::Mat image = randomMat(size, CV_MAKETYPE(CV_8U, cn));
     cv::Mat templ = randomMat(templ_size, CV_MAKETYPE(CV_8U, cn));
 
@@ -101,7 +93,6 @@ PARAM_TEST_CASE(MatchTemplate32F, cv::Size, TemplateSize, Channels, TemplateMeth
     cv::Size templ_size;
     int cn;
     int method;
-    //std::vector<cv::ocl::Info> oclinfo;
 
     virtual void SetUp()
     {
diff --git a/modules/ocl/test/test_matrix_operation.cpp b/modules/ocl/test/test_matrix_operation.cpp
index e8b502232..b70ee6ccd 100644
--- a/modules/ocl/test/test_matrix_operation.cpp
+++ b/modules/ocl/test/test_matrix_operation.cpp
@@ -53,189 +53,181 @@ using namespace testing;
 using namespace std;
 
 ////////////////////////////////converto/////////////////////////////////////////////////
-PARAM_TEST_CASE(ConvertToTestBase, MatType, MatType)
-{
-    int type;
-    int dst_type;
 
-    //src mat
+PARAM_TEST_CASE(ConvertToTestBase, MatType, MatType, int, bool) // (source depth, destination depth, channel count, use_roi)
+{
+    int src_depth, dst_depth;
+    int cn, dst_type; // dst_type is CV_MAKE_TYPE(dst_depth, cn)
+    bool use_roi; // false: random_roi() selects the whole matrices
+
+    // host source and destination matrices (whole buffers)
     cv::Mat mat;
     cv::Mat dst;
 
     // set up roi
-    int roicols;
-    int roirows;
-    int srcx;
-    int srcy;
-    int dstx;
-    int dsty;
+    int roicols, roirows; // ROI extent, shared by source and destination
+    int srcx, srcy; // ROI origin inside mat
+    int dstx, dsty; // ROI origin inside dst
 
-    //src mat with roi
+    // host ROI views into mat and dst
     cv::Mat mat_roi;
     cv::Mat dst_roi;
 
-    //ocl dst mat for testing
+    // ocl copy of the whole dst, refreshed by random_roi()
     cv::ocl::oclMat gdst_whole;
 
-    //ocl mat with roi
-    cv::ocl::oclMat gmat;
+    // ocl source (assigned from mat_roi) and ocl destination ROI (view into gdst_whole)
+    cv::ocl::oclMat gsrc;
     cv::ocl::oclMat gdst;
 
     virtual void SetUp()
     {
-        type     = GET_PARAM(0);
-        dst_type = GET_PARAM(1);
+        src_depth = GET_PARAM(0);
+        dst_depth = GET_PARAM(1);
+        cn = GET_PARAM(2);
+        int src_type = CV_MAKE_TYPE(src_depth, cn); // source and destination share the channel count
+        dst_type = CV_MAKE_TYPE(dst_depth, cn);
+
+        use_roi = GET_PARAM(3);
 
         cv::RNG &rng = TS::ptr()->get_rng();
-        cv::Size size(MWIDTH, MHEIGHT);
 
-        mat = randomMat(rng, size, type, 5, 16, false);
-        dst  = randomMat(rng, size, type, 5, 16, false);
+        mat = randomMat(rng, randomSize(MIN_VALUE, MAX_VALUE), src_type, 5, 136, false);
+        dst = randomMat(rng, use_roi ? randomSize(MIN_VALUE, MAX_VALUE) : mat.size(), dst_type, 5, 136, false); // dst gets its own random size only when ROIs are used
     }
 
     void random_roi()
     {
-#ifdef RANDOMROI
-        //randomize ROI
-        cv::RNG &rng = TS::ptr()->get_rng();
-        roicols = rng.uniform(1, mat.cols);
-        roirows = rng.uniform(1, mat.rows);
-        srcx   = rng.uniform(0, mat.cols - roicols);
-        srcy   = rng.uniform(0, mat.rows - roirows);
-        dstx    = rng.uniform(0, dst.cols  - roicols);
-        dsty    = rng.uniform(0, dst.rows  - roirows);
-#else
-        roicols = mat.cols;
-        roirows = mat.rows;
-        srcx = 0;
-        srcy = 0;
-        dstx = 0;
-        dsty = 0;
-#endif
+        if (use_roi)
+        {
+            // randomize ROI
+            cv::RNG &rng = TS::ptr()->get_rng();
+            roicols = rng.uniform(1, MIN_VALUE); // NOTE(review): presumably randomSize(MIN_VALUE, MAX_VALUE) yields at least MIN_VALUE per dimension, so this ROI fits in both mat and dst - confirm
+            roirows = rng.uniform(1, MIN_VALUE);
+            srcx = rng.uniform(0, mat.cols - roicols);
+            srcy = rng.uniform(0, mat.rows - roirows);
+            dstx = rng.uniform(0, dst.cols - roicols);
+            dsty = rng.uniform(0, dst.rows - roirows);
+        }
+        else
+        {
+            roicols = mat.cols; // whole-matrix case: dst was created with mat.size() in SetUp()
+            roirows = mat.rows;
+            srcx = srcy = 0;
+            dstx = dsty = 0;
+        }
 
         mat_roi = mat(Rect(srcx, srcy, roicols, roirows));
-        dst_roi  = dst(Rect(dstx, dsty, roicols, roirows));
+        dst_roi = dst(Rect(dstx, dsty, roicols, roirows));
 
         gdst_whole = dst;
         gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
 
-        gmat = mat_roi;
+        gsrc = mat_roi; // ocl source is built from the host ROI view
     }
 };
 
-
-struct ConvertTo : ConvertToTestBase {};
+typedef ConvertToTestBase ConvertTo;
 
 TEST_P(ConvertTo, Accuracy)
 {
-    for(int j = 0; j < LOOP_TIMES; j++)
+    for (int j = 0; j < LOOP_TIMES; j++) // random_roi() re-selects the ROI and refreshes the ocl buffers each pass
     {
         random_roi();
 
         mat_roi.convertTo(dst_roi, dst_type);
-        gmat.convertTo(gdst, dst_type);
+        gsrc.convertTo(gdst, dst_type); // ocl conversion, compared against the cv::Mat result above
 
-        EXPECT_MAT_NEAR(dst, Mat(gdst_whole), 0.0);
+        EXPECT_MAT_NEAR(dst, Mat(gdst_whole), src_depth == CV_64F ? 1.0 : 0.0); // whole buffers, so data outside the ROI is compared too; CV_64F sources get a tolerance of 1
+        EXPECT_MAT_NEAR(dst_roi, Mat(gdst), src_depth == CV_64F ? 1.0 : 0.0); // ROI views only, same tolerance
     }
 }
 
-
-
-
 ///////////////////////////////////////////copyto/////////////////////////////////////////////////////////////
 
-PARAM_TEST_CASE(CopyToTestBase, MatType, bool)
+PARAM_TEST_CASE(CopyToTestBase, MatType, int, bool) // (depth, channel count, use_roi)
 {
-    int type;
+    bool use_roi; // false: random_roi() selects the whole matrices
 
-    cv::Mat mat;
-    cv::Mat mask;
-    cv::Mat dst;
+    cv::Mat src, mask, dst; // host buffers; mask is CV_8UC1
 
     // set up roi
-    int roicols;
-    int roirows;
-    int srcx;
-    int srcy;
-    int dstx;
-    int dsty;
-    int maskx;
-    int masky;
+    int roicols,roirows; // ROI extent shared by src, dst and mask
+    int srcx, srcy; // ROI origin inside src
+    int dstx, dsty; // ROI origin inside dst
+    int maskx,masky; // ROI origin inside mask
 
-    //src mat with roi
-    cv::Mat mat_roi;
+    // host ROI views
+    cv::Mat src_roi;
     cv::Mat mask_roi;
     cv::Mat dst_roi;
 
-    //ocl dst mat for testing
+    // ocl copy of the whole dst, refreshed by random_roi()
     cv::ocl::oclMat gdst_whole;
 
-    //ocl mat with roi
-    cv::ocl::oclMat gmat;
-    cv::ocl::oclMat gdst;
-    cv::ocl::oclMat gmask;
+    // ocl source and mask (assigned from the host ROI views) and ocl destination ROI (view into gdst_whole)
+    cv::ocl::oclMat gsrc, gdst, gmask;
 
     virtual void SetUp()
     {
-        type = GET_PARAM(0);
+        int type = CV_MAKETYPE(GET_PARAM(0), GET_PARAM(1)); // depth and channel count arrive as separate parameters
+        use_roi = GET_PARAM(2);
 
         cv::RNG &rng = TS::ptr()->get_rng();
-        cv::Size size(MWIDTH, MHEIGHT);
 
-        mat = randomMat(rng, size, type, 5, 16, false);
-        dst  = randomMat(rng, size, type, 5, 16, false);
-        mask = randomMat(rng, size, CV_8UC1, 0, 2,  false);
+        src = randomMat(rng, randomSize(MIN_VALUE, MAX_VALUE), type, 5, 16, false);
+        dst = randomMat(rng, use_roi ? randomSize(MIN_VALUE, MAX_VALUE) : src.size(), type, 5, 16, false); // independent size only when ROIs are used
+        mask = randomMat(rng, use_roi ? randomSize(MIN_VALUE, MAX_VALUE) : src.size(), CV_8UC1, 0, 2,  false);
 
         cv::threshold(mask, mask, 0.5, 255., CV_8UC1);
-
     }
 
     void random_roi()
     {
-#ifdef RANDOMROI
-        //randomize ROI
-        cv::RNG &rng = TS::ptr()->get_rng();
-        roicols = rng.uniform(1, mat.cols);
-        roirows = rng.uniform(1, mat.rows);
-        srcx   = rng.uniform(0, mat.cols - roicols);
-        srcy   = rng.uniform(0, mat.rows - roirows);
-        dstx    = rng.uniform(0, dst.cols  - roicols);
-        dsty    = rng.uniform(0, dst.rows  - roirows);
-        maskx   = rng.uniform(0, mask.cols - roicols);
-        masky   = rng.uniform(0, mask.rows - roirows);
-#else
-        roicols = mat.cols;
-        roirows = mat.rows;
-        srcx = 0;
-        srcy = 0;
-        dstx = 0;
-        dsty = 0;
-        maskx = 0;
-        masky = 0;
-#endif
+        if (use_roi)
+        {
+            // randomize ROI
+            cv::RNG &rng = TS::ptr()->get_rng();
+            roicols = rng.uniform(1, MIN_VALUE); // NOTE(review): presumably every randomSize(MIN_VALUE, MAX_VALUE) buffer is at least MIN_VALUE per dimension, so the ROI fits in src, dst and mask - confirm
+            roirows = rng.uniform(1, MIN_VALUE);
+            srcx = rng.uniform(0, src.cols - roicols);
+            srcy = rng.uniform(0, src.rows - roirows);
+            dstx = rng.uniform(0, dst.cols - roicols);
+            dsty = rng.uniform(0, dst.rows - roirows);
+            maskx = rng.uniform(0, mask.cols - roicols);
+            masky = rng.uniform(0, mask.rows - roirows);
+        }
+        else
+        {
+            roicols = src.cols; // whole-matrix case: dst and mask were created with src.size() in SetUp()
+            roirows = src.rows;
+            srcx = srcy = 0;
+            dstx = dsty = 0;
+            maskx = masky = 0;
+        }
 
-        mat_roi = mat(Rect(srcx, srcy, roicols, roirows));
+        src_roi = src(Rect(srcx, srcy, roicols, roirows));
         mask_roi = mask(Rect(maskx, masky, roicols, roirows));
-        dst_roi  = dst(Rect(dstx, dsty, roicols, roirows));
+        dst_roi = dst(Rect(dstx, dsty, roicols, roirows));
 
         gdst_whole = dst;
         gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
 
-        gmat = mat_roi;
+        gsrc = src_roi; // ocl source is built from the host ROI view
         gmask = mask_roi;
     }
 };
 
-struct CopyTo : CopyToTestBase {};
+typedef CopyToTestBase CopyTo;
 
 TEST_P(CopyTo, Without_mask)
 {
-    for(int j = 0; j < LOOP_TIMES; j++)
+    for (int j = 0; j < LOOP_TIMES; j++)
     {
         random_roi();
 
-        mat_roi.copyTo(dst_roi);
-        gmat.copyTo(gdst);
+        src_roi.copyTo(dst_roi);
+        gsrc.copyTo(gdst);
 
         EXPECT_MAT_NEAR(dst, Mat(gdst_whole), 0.0);
     }
@@ -243,221 +235,199 @@ TEST_P(CopyTo, Without_mask)
 
 TEST_P(CopyTo, With_mask)
 {
-    for(int j = 0; j < LOOP_TIMES; j++)
+    for (int j = 0; j < LOOP_TIMES; j++) // random_roi() re-selects the ROI and refreshes the ocl buffers each pass
     {
         random_roi();
 
-        mat_roi.copyTo(dst_roi, mask_roi);
-        gmat.copyTo(gdst, gmask);
+        src_roi.copyTo(dst_roi, mask_roi); // host reference result
+        gsrc.copyTo(gdst, gmask); // ocl masked copy, compared against the host result below
 
         EXPECT_MAT_NEAR(dst, Mat(gdst_whole), 0.0);
     }
 }
 
+/////////////////////////////////////////// setTo /////////////////////////////////////////////////////////////
 
-
-
-///////////////////////////////////////////copyto/////////////////////////////////////////////////////////////
-
-PARAM_TEST_CASE(SetToTestBase, MatType, bool)
+PARAM_TEST_CASE(SetToTestBase, MatType, int, bool) // (depth, channel count, use_roi)
 {
-    int type;
+    int depth, channels;
+    bool use_roi; // false: random_roi() selects the whole matrices
+
     cv::Scalar val;
 
-    cv::Mat mat;
+    cv::Mat src; // host buffer that setTo() modifies in place
     cv::Mat mask;
 
     // set up roi
-    int roicols;
-    int roirows;
-    int srcx;
-    int srcy;
-    int maskx;
-    int masky;
+    int roicols, roirows; // ROI extent shared by src and mask
+    int srcx, srcy; // ROI origin inside src
+    int maskx, masky; // ROI origin inside mask
 
-    //src mat with roi
-    cv::Mat mat_roi;
+    // host ROI views
+    cv::Mat src_roi;
     cv::Mat mask_roi;
 
-    //ocl dst mat for testing
-    cv::ocl::oclMat gmat_whole;
+    // ocl copy of the whole src, refreshed by random_roi()
+    cv::ocl::oclMat gsrc_whole;
 
-    //ocl mat with roi
-    cv::ocl::oclMat gmat;
+    // ocl ROI view into gsrc_whole, and ocl mask assigned from mask_roi
+    cv::ocl::oclMat gsrc;
     cv::ocl::oclMat gmask;
 
     virtual void SetUp()
     {
-        type = GET_PARAM(0);
+        depth = GET_PARAM(0);
+        channels = GET_PARAM(1);
+        use_roi = GET_PARAM(2);
 
         cv::RNG &rng = TS::ptr()->get_rng();
-        cv::Size size(MWIDTH, MHEIGHT);
+        int type = CV_MAKE_TYPE(depth, channels);
 
-        mat = randomMat(rng, size, type, 5, 16, false);
-        mask = randomMat(rng, size, CV_8UC1, 0, 2,  false);
+        src = randomMat(rng, randomSize(MIN_VALUE, MAX_VALUE), type, 5, 16, false);
+        mask = randomMat(rng, use_roi ? randomSize(MIN_VALUE, MAX_VALUE) : src.size(), CV_8UC1, 0, 2,  false); // independent size only when ROIs are used
 
         cv::threshold(mask, mask, 0.5, 255., CV_8UC1);
-        val = cv::Scalar(rng.uniform(-10.0, 10.0), rng.uniform(-10.0, 10.0), rng.uniform(-10.0, 10.0), rng.uniform(-10.0, 10.0));
-
+        val = cv::Scalar(rng.uniform(-10.0, 10.0), rng.uniform(-10.0, 10.0), // one random component per possible channel
+                         rng.uniform(-10.0, 10.0), rng.uniform(-10.0, 10.0));
     }
 
     void random_roi()
     {
-#ifdef RANDOMROI
-        //randomize ROI
-        cv::RNG &rng = TS::ptr()->get_rng();
-        roicols = rng.uniform(1, mat.cols);
-        roirows = rng.uniform(1, mat.rows);
-        srcx   = rng.uniform(0, mat.cols - roicols);
-        srcy   = rng.uniform(0, mat.rows - roirows);
-        maskx   = rng.uniform(0, mask.cols - roicols);
-        masky   = rng.uniform(0, mask.rows - roirows);
-#else
-        roicols = mat.cols;
-        roirows = mat.rows;
-        srcx = 0;
-        srcy = 0;
-        maskx = 0;
-        masky = 0;
-#endif
+        if (use_roi)
+        {
+            // randomize ROI
+            cv::RNG &rng = TS::ptr()->get_rng();
+            roicols = rng.uniform(1, MIN_VALUE); // NOTE(review): presumably every randomSize(MIN_VALUE, MAX_VALUE) buffer is at least MIN_VALUE per dimension, so the ROI fits in src and mask - confirm
+            roirows = rng.uniform(1, MIN_VALUE);
+            srcx = rng.uniform(0, src.cols - roicols);
+            srcy = rng.uniform(0, src.rows - roirows);
+            maskx = rng.uniform(0, mask.cols - roicols);
+            masky = rng.uniform(0, mask.rows - roirows);
+        }
+        else
+        {
+            roicols = src.cols; // whole-matrix case: mask was created with src.size() in SetUp()
+            roirows = src.rows;
+            srcx = srcy = 0;
+            maskx = masky = 0;
+        }
 
-        mat_roi = mat(Rect(srcx, srcy, roicols, roirows));
+        src_roi = src(Rect(srcx, srcy, roicols, roirows));
         mask_roi = mask(Rect(maskx, masky, roicols, roirows));
 
-        gmat_whole = mat;
-        gmat = gmat_whole(Rect(srcx, srcy, roicols, roirows));
+        gsrc_whole = src; // refresh the ocl copy from the current host contents
+        gsrc = gsrc_whole(Rect(srcx, srcy, roicols, roirows));
 
         gmask = mask_roi;
     }
 };
 
-struct SetTo : SetToTestBase {};
+typedef SetToTestBase SetTo;
 
 TEST_P(SetTo, Without_mask)
 {
-    for(int j = 0; j < LOOP_TIMES; j++)
+    for (int j = 0; j < LOOP_TIMES; j++) // random_roi() re-selects the ROI and refreshes the ocl buffers each pass
     {
         random_roi();
 
-        mat_roi.setTo(val);
-        gmat.setTo(val);
+        src_roi.setTo(val); // host reference, writes through the ROI view into src
+        gsrc.setTo(val); // ocl counterpart, writes through the ROI view into gsrc_whole
 
-        EXPECT_MAT_NEAR(mat, Mat(gmat_whole), 1.);
+        EXPECT_MAT_NEAR(src, Mat(gsrc_whole), 1.); // whole buffers compared, tolerance of 1
     }
 }
 
 TEST_P(SetTo, With_mask)
 {
-    for(int j = 0; j < LOOP_TIMES; j++)
+    for (int j = 0; j < LOOP_TIMES; j++) // random_roi() re-selects the ROI and refreshes the ocl buffers each pass
     {
         random_roi();
 
-        mat_roi.setTo(val, mask_roi);
-        gmat.setTo(val, gmask);
+        src_roi.setTo(val, mask_roi); // host reference with mask
+        gsrc.setTo(val, gmask); // ocl counterpart with the ocl mask
 
-        EXPECT_MAT_NEAR(mat, Mat(gmat_whole), 1.);
+        EXPECT_MAT_NEAR(src, Mat(gsrc_whole), 1.); // whole buffers compared, tolerance of 1
     }
 }
 
-//convertC3C4
-PARAM_TEST_CASE(convertC3C4, MatType, cv::Size)
+// convertC3C4: assigns a 3-channel host matrix to an oclMat and reads it back
+
+PARAM_TEST_CASE(convertC3C4, MatType, bool) // (depth, use_roi)
 {
-    int type;
-    cv::Size ksize;
+    int depth;
+    bool use_roi; // false: random_roi() selects the whole matrix
 
     //src mat
-    cv::Mat mat1;
-    cv::Mat dst;
+    cv::Mat src; // always 3-channel, see SetUp()
 
     // set up roi
-    int roicols;
-    int roirows;
-    int src1x;
-    int src1y;
-    int dstx;
-    int dsty;
+    int roicols, roirows; // ROI extent
+    int srcx, srcy; // ROI origin inside src
 
     //src mat with roi
-    cv::Mat mat1_roi;
-    cv::Mat dst_roi;
-
-    //ocl dst mat for testing
-    cv::ocl::oclMat gdst_whole;
+    cv::Mat src_roi;
 
     //ocl mat with roi
-    cv::ocl::oclMat gmat1;
-    cv::ocl::oclMat gdst;
+    cv::ocl::oclMat gsrc_roi; // assigned from src_roi by the test body
 
     virtual void SetUp()
     {
-        type = GET_PARAM(0);
-        ksize = GET_PARAM(1);
+        depth = GET_PARAM(0);
+        use_roi = GET_PARAM(1);
+        int type = CV_MAKE_TYPE(depth, 3); // channel count is fixed at 3 for this fixture
 
+        cv::RNG &rng = TS::ptr()->get_rng();
+        src = randomMat(rng, randomSize(MIN_VALUE, MAX_VALUE), type, 0, 40, false);
     }
 
     void random_roi()
     {
-#ifdef RANDOMROI
-        //randomize ROI
-        cv::RNG &rng = TS::ptr()->get_rng();
-        roicols = rng.uniform(2, mat1.cols);
-        roirows = rng.uniform(2, mat1.rows);
-        src1x   = rng.uniform(0, mat1.cols - roicols);
-        src1y   = rng.uniform(0, mat1.rows - roirows);
-        dstx    = rng.uniform(0, dst.cols  - roicols);
-        dsty    = rng.uniform(0, dst.rows  - roirows);
-#else
-        roicols = mat1.cols;
-        roirows = mat1.rows;
-        src1x = 0;
-        src1y = 0;
-        dstx = 0;
-        dsty = 0;
-#endif
+        if (use_roi)
+        {
+            //randomize ROI
+            cv::RNG &rng = TS::ptr()->get_rng();
+            roicols = rng.uniform(1, src.cols); // only one matrix here, so the extent is bounded by src itself
+            roirows = rng.uniform(1, src.rows);
+            srcx = rng.uniform(0, src.cols - roicols);
+            srcy = rng.uniform(0, src.rows - roirows);
+        }
+        else
+        {
+            roicols = src.cols;
+            roirows = src.rows;
+            srcx = srcy = 0;
+        }
 
-        mat1_roi = mat1(Rect(src1x, src1y, roicols, roirows));
-        dst_roi  = dst(Rect(dstx, dsty, roicols, roirows));
-
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-
-
-        gmat1 = mat1_roi;
+        src_roi = src(Rect(srcx, srcy, roicols, roirows));
     }
-
 };
 
 TEST_P(convertC3C4, Accuracy)
 {
-    cv::RNG &rng = TS::ptr()->get_rng();
-    for(int j = 0; j < LOOP_TIMES; j++)
+    for (int j = 0; j < LOOP_TIMES; j++) // a new ROI of the fixture's src is selected on every pass
     {
-        //random_roi();
-        int width = rng.uniform(2, MWIDTH);
-        int height = rng.uniform(2, MHEIGHT);
-        cv::Size size(width, height);
+        random_roi();
 
-        mat1 = randomMat(rng, size, type, 0, 40, false);
-        gmat1 = mat1;
+        gsrc_roi = src_roi; // assign the 3-channel host ROI to the oclMat
 
-        EXPECT_MAT_NEAR(mat1, Mat(gmat1), 0.0);
+        EXPECT_MAT_NEAR(src_roi, Mat(gsrc_roi), 0.0); // converting back must reproduce the host data exactly
     }
-
 }
 
 INSTANTIATE_TEST_CASE_P(MatrixOperation, ConvertTo, Combine(
-                            Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32SC1, CV_32SC4, CV_32FC1, CV_32FC4),
-                            Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32SC1, CV_32SC4, CV_32FC1, CV_32FC4)));
+                            Values(CV_8U, CV_8S, CV_16U, CV_16S, CV_32S, CV_32F, CV_64F),
+                            Values(CV_8U, CV_8S, CV_16U, CV_16S, CV_32S, CV_32F, CV_64F),
+                            testing::Range(1, 5), Bool())); // source depth x destination depth x channels 1..4 x use_roi; Range is qualified like its siblings so it cannot clash with cv::Range
 
 INSTANTIATE_TEST_CASE_P(MatrixOperation, CopyTo, Combine(
-                            Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32SC1, CV_32SC3, CV_32SC4, CV_32FC1, CV_32FC3, CV_32FC4),
-                            Values(false))); // Values(false) is the reserved parameter
+                            Values(CV_8U, CV_8S, CV_16U, CV_16S, CV_32S, CV_32F, CV_64F),
+                            testing::Range(1, 5), Bool())); // depth x channels 1..4 x use_roi
 
 INSTANTIATE_TEST_CASE_P(MatrixOperation, SetTo, Combine(
-                            Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32SC1, CV_32SC3, CV_32SC4, CV_32FC1, CV_32FC3, CV_32FC4),
-                            Values(false))); // Values(false) is the reserved parameter
+                            Values(CV_8U, CV_8S, CV_16U, CV_16S, CV_32S, CV_32F, CV_64F),
+                            testing::Range(1, 5), Bool())); // depth x channels 1..4 x use_roi
 
 INSTANTIATE_TEST_CASE_P(MatrixOperation, convertC3C4, Combine(
-                            Values(CV_8UC3,  CV_32SC3,  CV_32FC3),
-                            Values(cv::Size())));
+                            Values(CV_8U, CV_8S, CV_16U, CV_16S, CV_32S, CV_32F, CV_64F),
+                            Bool())); // depth x use_roi; the fixture fixes the channel count at 3
 #endif
diff --git a/modules/ocl/test/test_ml.cpp b/modules/ocl/test/test_ml.cpp
new file mode 100644
index 000000000..af86d35a6
--- /dev/null
+++ b/modules/ocl/test/test_ml.cpp
@@ -0,0 +1,300 @@
+///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Jin Ma,        jin@multicorewareinc.com
+//    Xiaopeng Fu,   fuxiaopeng2222@163.com
+//    Erping Pang,   pang_er_ping@163.com
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "test_precomp.hpp"
+#ifdef HAVE_OPENCL
+using namespace cv;
+using namespace cv::ocl;
+using namespace cvtest;
+using namespace testing;
+///////K-NEAREST NEIGHBOR//////////////////////////
+static void genTrainData(Mat& trainData, int trainDataRow, int trainDataCol,  // fills trainData with a CV_32FC1 matrix from randomMat (values requested in 1..1000)
+                         Mat& trainLabel = Mat().setTo(Scalar::all(0)), int nClasses = 0)  // trainLabel is written only when nClasses != 0
+{
+    cv::RNG &rng = TS::ptr()->get_rng();
+    cv::Size size(trainDataCol, trainDataRow);  // cv::Size is (width, height): trainDataRow rows of trainDataCol values
+    trainData = randomMat(rng, size, CV_32FC1, 1.0, 1000.0, false);
+    if(nClasses != 0)
+    {
+        cv::Size size1(trainDataRow, 1);  // single-row label matrix with one entry per data row
+        trainLabel = randomMat(rng, size1, CV_8UC1, 0, nClasses - 1, false);  // NOTE(review): whether nClasses - 1 itself can be drawn depends on randomMat's bounds - confirm
+        trainLabel.convertTo(trainLabel, CV_32FC1);  // labels are generated as 8-bit and handed back as float
+    }
+}
+
+PARAM_TEST_CASE(KNN, int, Size, int, bool)
+{
+    // Parameter tuple: (k, Size(feature count, test sample count), class count, regression).
+    int k, nClass;
+    int trainDataCol, testDataRow;
+    bool regression;
+    virtual void SetUp()
+    {
+        const Size dims = GET_PARAM(1);
+        k = GET_PARAM(0);
+        trainDataCol = dims.width;
+        testDataRow = dims.height;
+        nClass = GET_PARAM(2);
+        regression = GET_PARAM(3);
+    }
+};
+
+// Trains the OpenCL and CPU k-nearest-neighbour classifiers on the same generated
+// data and checks that both report the same result for every test sample.
+TEST_P(KNN, Accuracy)
+{
+    Mat trainData, trainLabels;
+    const int trainDataRow = 500;
+    genTrainData(trainData, trainDataRow, trainDataCol, trainLabels, nClass);
+
+    // Query samples need no labels, so genTrainData's label defaults are used.
+    Mat testData;
+    genTrainData(testData, testDataRow, trainDataCol);
+
+    KNearestNeighbour knn_ocl;
+    CvKNearest knn_cpu;
+    Mat best_label_cpu;
+    oclMat best_label_ocl;
+    Mat simpleIdx; // left empty; passed as the sample-index argument of both train() calls
+
+    // OpenCL path: train() is given the host matrices, only the query set is uploaded.
+    knn_ocl.train(trainData, trainLabels, simpleIdx, regression);
+
+    oclMat testdata;
+    testdata.upload(testData);
+    knn_ocl.find_nearest(testdata, k, best_label_ocl);
+
+    // CPU reference path.
+    knn_cpu.train(trainData, trainLabels, simpleIdx, regression);
+    knn_cpu.find_nearest(testData, k, &best_label_cpu);
+
+    if(regression)
+    {
+        EXPECT_MAT_SIMILAR(Mat(best_label_ocl), best_label_cpu, 1e-5);
+    }
+    else
+    {
+        EXPECT_MAT_NEAR(Mat(best_label_ocl), best_label_cpu, 0.0);
+    }
+}
+INSTANTIATE_TEST_CASE_P(OCL_ML, KNN, Combine(Values(6, 5), Values(Size(200, 400), Size(300, 600)),
+    Values(4, 3), Values(false, true)));
+
+////////////////////////////////SVM/////////////////////////////////////////////////
+PARAM_TEST_CASE(SVM_OCL, int, int, int)
+{
+    cv::Size size;                // training set geometry: width = features, height = samples
+    int kernel_type;              // GET_PARAM(0)
+    int svm_type;                 // GET_PARAM(1)
+    Mat src, labels;              // training samples (one per row) and their labels
+    Mat samples, labels_predict;  // evaluation samples and their expected labels
+    int K;                        // number of generated groups, GET_PARAM(2)
+    cv::RNG rng;                  // copy of the test-system RNG taken in SetUp()
+
+    // Builds a training set of MHEIGHT samples and an evaluation set of 100
+    // samples, both MWIDTH features wide and laid out by genClusters().
+    virtual void SetUp()
+    {
+        kernel_type = GET_PARAM(0);
+        svm_type = GET_PARAM(1);
+        K = GET_PARAM(2);
+        rng = TS::ptr()->get_rng();
+
+        // Assign the fixture member: a local with the same name used to shadow
+        // it, which left the member default-constructed.
+        size = cv::Size(MWIDTH, MHEIGHT);
+        genClusters(size, src, labels);
+        genClusters(cv::Size(MWIDTH, 100), samples, labels_predict);
+    }
+
+    // Fills `data` (sz.height rows of sz.width floats) with K consecutive groups
+    // of rows and writes the group index of every row to `lbls` (1 x sz.height).
+    //
+    // Each group starts with a "center" row that is zero except for the value
+    // 500 in column i (i = group index); every other row of the group is that
+    // center plus a noise row from randomMat(..., 1, 100, false). Groups hold
+    // sz.height / K rows; the last one also takes the sz.height % K leftovers.
+    // `lbls` is built as CV_32SC1 and converted to CV_32FC1 before returning.
+    void genClusters(cv::Size sz, Mat &data, Mat &lbls)
+    {
+        // Checked before the division below; K > sz.height would make the
+        // per-group row count negative.
+        CV_Assert(K > 0 && K <= sz.height);
+
+        data.create(sz, CV_32FC1);
+        lbls.create(1, sz.height, CV_32SC1);
+
+        const int noisy_rows = sz.height / K - 1; // rows per group besides its center
+        int row_idx = 0;
+        for(int i = 0; i < K; i++)
+        {
+            Mat center_row_header = data.row(row_idx);
+            center_row_header.setTo(0);
+            const int nchannel = center_row_header.channels();
+            for(int j = 0; j < nchannel; j++)
+            {
+                center_row_header.at<float>(0, i * nchannel + j) = 500.0;
+            }
+            lbls.at<int>(0, row_idx) = i;
+
+            // Only the last group absorbs the remainder rows.
+            const int extra_rows = (i == K - 1) ? sz.height % K : 0;
+            for(int j = 0; j < noisy_rows + extra_rows; j++)
+            {
+                Mat cur_row_header = data.row(row_idx + 1 + j);
+                center_row_header.copyTo(cur_row_header);
+                Mat tmpmat = randomMat(rng, cur_row_header.size(), cur_row_header.type(), 1, 100, false);
+                cur_row_header += tmpmat;
+                lbls.at<int>(0, row_idx + 1 + j) = i;
+            }
+            row_idx += 1 + noisy_rows;
+        }
+
+        // The test body reads labels_predict through at<float>().
+        lbls.convertTo(lbls, CV_32FC1);
+    }
+};
+// Trains CvSVM and cv::ocl::CvSVM_OCL with identical parameters, then compares
+// their support vectors and their prediction accuracy on the evaluation set.
+TEST_P(SVM_OCL, Accuracy)
+{
+    CvSVMParams params;
+    params.degree = 0.4;
+    params.gamma = 1;
+    params.coef0 = 1;
+    params.C = 1;
+    params.nu = 0.5;
+    params.p = 1;
+    params.svm_type = svm_type;
+    params.kernel_type = kernel_type;
+    params.term_crit = cvTermCriteria(CV_TERMCRIT_ITER, 1000, 0.001);
+
+    CvSVM SVM;
+    SVM.train(src, labels, Mat(), Mat(), params);
+
+    cv::ocl::CvSVM_OCL SVM_OCL;
+    SVM_OCL.train(src, labels, Mat(), Mat(), params);
+
+    int c = SVM.get_support_vector_count();
+    int c1 = SVM_OCL.get_support_vector_count();
+
+    // src holds one training sample per row, so a support vector has src.cols
+    // entries. MHEIGHT was used here before, which only gives the same length
+    // while MWIDTH == MHEIGHT.
+    const int var_count = src.cols;
+
+    Mat sv(c, var_count, CV_32FC1);
+    Mat sv_ocl(c1, var_count, CV_32FC1);
+    for(int i = 0; i < c; i++)
+    {
+        const float* v = SVM.get_support_vector(i);
+        for(int j = 0; j < var_count; j++)
+            sv.at<float>(i, j) = v[j];
+    }
+    for(int i = 0; i < c1; i++)
+    {
+        const float* v_ocl = SVM_OCL.get_support_vector(i);
+        for(int j = 0; j < var_count; j++)
+            sv_ocl.at<float>(i, j) = v_ocl[j];
+    }
+
+    // Match each CPU support vector to its closest OpenCL one (L2 distance)
+    // and count the pairs that are closer than 0.1.
+    cv::BFMatcher matcher(cv::NORM_L2);
+    std::vector<cv::DMatch> matches;
+    matcher.match(sv, sv_ocl, matches);
+    int count = 0;
+    for(std::vector<cv::DMatch>::iterator itr = matches.begin(); itr != matches.end(); itr++)
+    {
+        if((*itr).distance < 0.1)
+            count ++;
+    }
+
+    if(c != 0)
+    {
+        float svMatchedRatio = (float)count / c;
+        EXPECT_GT(svMatchedRatio, 0.95);
+
+        CvMat *result = cvCreateMat(1, samples.rows, CV_32FC1);
+        CvMat *result_ocl = cvCreateMat(1, samples.rows, CV_32FC1);
+        CvMat test_samples = samples;
+
+        SVM.predict(&test_samples, result);
+        SVM_OCL.predict(&test_samples, result_ocl);
+
+        // Count, per implementation, the predictions equal to the expected label.
+        int true_resp = 0, true_resp_ocl = 0;
+        for (int i = 0; i < samples.rows; i++)
+        {
+            const float expected = labels_predict.at<float>(0, i);
+            if (result->data.fl[i] == expected)
+            {
+                true_resp++;
+            }
+            if (result_ocl->data.fl[i] == expected)
+            {
+                true_resp_ocl++;
+            }
+        }
+        float matchedRatio = (float)true_resp / samples.rows;
+        float matchedRatio_ocl = (float)true_resp_ocl / samples.rows;
+
+        // The OpenCL accuracy is only checked when it is below the CPU accuracy.
+        if(matchedRatio != 0 && true_resp_ocl < true_resp)
+        {
+            EXPECT_NEAR(matchedRatio_ocl, matchedRatio, 0.03);
+        }
+
+        // The cvCreateMat() buffers were never released before; free them here.
+        cvReleaseMat(&result);
+        cvReleaseMat(&result_ocl);
+    }
+}
+INSTANTIATE_TEST_CASE_P(OCL_ML, SVM_OCL, testing::Combine(
+                            Values(CvSVM::LINEAR, CvSVM::POLY, CvSVM::RBF, CvSVM::SIGMOID),
+                            Values(CvSVM::C_SVC, CvSVM::NU_SVC, CvSVM::ONE_CLASS, CvSVM::EPS_SVR, CvSVM::NU_SVR),
+                            Values(2, 3, 4)
+                        ));
+#endif // HAVE_OPENCL
diff --git a/modules/ocl/test/test_precomp.hpp b/modules/ocl/test/test_precomp.hpp
index 72d5089d6..743fa9675 100644
--- a/modules/ocl/test/test_precomp.hpp
+++ b/modules/ocl/test/test_precomp.hpp
@@ -64,6 +64,7 @@
 #include "opencv2/ts.hpp"
 #include "opencv2/highgui.hpp"
 #include "opencv2/imgproc.hpp"
+#include "opencv2/features2d.hpp"
 #include "opencv2/video.hpp"
 #include "opencv2/ocl.hpp"
 
diff --git a/modules/ocl/test/test_pyramids.cpp b/modules/ocl/test/test_pyramids.cpp
index 58179ac18..9070ee5aa 100644
--- a/modules/ocl/test/test_pyramids.cpp
+++ b/modules/ocl/test/test_pyramids.cpp
@@ -57,60 +57,63 @@ using namespace std;
 
 PARAM_TEST_CASE(PyrBase, MatType, int)
 {
-    int type;
+    int depth; // element depth (GET_PARAM(0)); the tests combine it with channels via CV_MAKETYPE
     int channels;
+
     Mat dst_cpu;
     oclMat gdst;
+
     virtual void SetUp()
     {
-        type = GET_PARAM(0);
+        depth = GET_PARAM(0);
         channels = GET_PARAM(1);
     }
-
 };
 
 /////////////////////// PyrDown //////////////////////////
-struct PyrDown : PyrBase {};
+
+typedef PyrBase PyrDown;
 
 TEST_P(PyrDown, Mat)
 {
-    for(int j = 0; j < LOOP_TIMES; j++)
+    for (int j = 0; j < LOOP_TIMES; j++) // a new random source is generated on every pass
     {
         Size size(MWIDTH, MHEIGHT);
-        Mat src = randomMat(size, CV_MAKETYPE(type, channels));
+        Mat src = randomMat(size, CV_MAKETYPE(depth, channels)); // full type is built from the depth and channel parameters
         oclMat gsrc(src);
 
         pyrDown(src, dst_cpu);
         pyrDown(gsrc, gdst);
 
-        EXPECT_MAT_NEAR(dst_cpu, Mat(gdst), type == CV_32F ? 1e-4f : 1.0f);
+        EXPECT_MAT_NEAR(dst_cpu, Mat(gdst), depth == CV_32F ? 1e-4f : 1.0f); // CV_32F must agree within 1e-4; every other depth may differ by 1
     }
 }
 
 INSTANTIATE_TEST_CASE_P(OCL_ImgProc, PyrDown, Combine(
-                            Values(CV_8U, CV_32F), Values(1, 3, 4)));
+                            Values(CV_8U, CV_16U, CV_16S, CV_32F),
+                            Values(1, 3, 4)));
 
 /////////////////////// PyrUp //////////////////////////
 
-struct PyrUp : PyrBase {};
+typedef PyrBase PyrUp;
 
 TEST_P(PyrUp, Accuracy)
 {
-    for(int j = 0; j < LOOP_TIMES; j++)
+    for (int j = 0; j < LOOP_TIMES; j++) // a new random source is generated on every pass
     {
         Size size(MWIDTH, MHEIGHT);
-        Mat src = randomMat(size, CV_MAKETYPE(type, channels));
+        Mat src = randomMat(size, CV_MAKETYPE(depth, channels)); // full type is built from the depth and channel parameters
         oclMat gsrc(src);
 
         pyrUp(src, dst_cpu);
         pyrUp(gsrc, gdst);
 
-        EXPECT_MAT_NEAR(dst_cpu, Mat(gdst), (type == CV_32F ? 1e-4f : 1.0));
+        EXPECT_MAT_NEAR(dst_cpu, Mat(gdst), (depth == CV_32F ? 1e-4f : 1.0)); // CV_32F must agree within 1e-4; every other depth may differ by 1
     }
-
 }
 
 
-INSTANTIATE_TEST_CASE_P(OCL_ImgProc, PyrUp, testing::Combine(
-                            Values(CV_8U, CV_32F), Values(1, 3, 4)));
+INSTANTIATE_TEST_CASE_P(OCL_ImgProc, PyrUp, Combine(
+                            Values(CV_8U, CV_16U, CV_16S, CV_32F),
+                            Values(1, 3, 4)));
 #endif // HAVE_OPENCL
diff --git a/modules/ocl/test/test_split_merge.cpp b/modules/ocl/test/test_split_merge.cpp
index 9663f5321..52db49b02 100644
--- a/modules/ocl/test/test_split_merge.cpp
+++ b/modules/ocl/test/test_split_merge.cpp
@@ -52,39 +52,27 @@ using namespace cvtest;
 using namespace testing;
 using namespace std;
 
-PARAM_TEST_CASE(MergeTestBase, MatType, int)
+#define MAX_CHANNELS 4
+
+PARAM_TEST_CASE(MergeTestBase, MatType, int, bool)
 {
     int type;
     int channels;
+    bool use_roi;
 
     //src mat
-    cv::Mat mat1;
-    cv::Mat mat2;
-    cv::Mat mat3;
-    cv::Mat mat4;
-
+    cv::Mat mat[MAX_CHANNELS];
     //dst mat
     cv::Mat dst;
 
     // set up roi
-    int roicols;
-    int roirows;
-    int src1x;
-    int src1y;
-    int src2x;
-    int src2y;
-    int src3x;
-    int src3y;
-    int src4x;
-    int src4y;
-    int dstx;
-    int dsty;
+    int roicols, roirows;
+    int srcx[MAX_CHANNELS];
+    int srcy[MAX_CHANNELS];
+    int dstx, dsty;
 
     //src mat with roi
-    cv::Mat mat1_roi;
-    cv::Mat mat2_roi;
-    cv::Mat mat3_roi;
-    cv::Mat mat4_roi;
+    cv::Mat mat_roi[MAX_CHANNELS];
 
     //dst mat with roi
     cv::Mat dst_roi;
@@ -93,78 +81,62 @@ PARAM_TEST_CASE(MergeTestBase, MatType, int)
     cv::ocl::oclMat gdst_whole;
 
     //ocl mat with roi
-    cv::ocl::oclMat gmat1;
-    cv::ocl::oclMat gmat2;
-    cv::ocl::oclMat gmat3;
-    cv::ocl::oclMat gmat4;
+    cv::ocl::oclMat gmat[MAX_CHANNELS];
     cv::ocl::oclMat gdst;
 
     virtual void SetUp()
     {
         type = GET_PARAM(0);
         channels = GET_PARAM(1);
+        use_roi = GET_PARAM(2);
 
         cv::RNG &rng = TS::ptr()->get_rng();
         cv::Size size(MWIDTH, MHEIGHT);
 
-        mat1 = randomMat(rng, size, CV_MAKETYPE(type, 1), 5, 16, false);
-        mat2 = randomMat(rng, size, CV_MAKETYPE(type, 1), 5, 16, false);
-        mat3 = randomMat(rng, size, CV_MAKETYPE(type, 1), 5, 16, false);
-        mat4 = randomMat(rng, size, CV_MAKETYPE(type, 1), 5, 16, false);
-        dst  = randomMat(rng, size, CV_MAKETYPE(type, channels), 5, 16, false);
-
+        for (int i = 0; i < channels; ++i)
+            mat[i] = randomMat(rng, size, CV_MAKETYPE(type, 1), 5, 16, false);
+        dst = randomMat(rng, size, CV_MAKETYPE(type, channels), 5, 16, false);
     }
 
     void random_roi()
     {
-#ifdef RANDOMROI
-        //randomize ROI
-        cv::RNG &rng = TS::ptr()->get_rng();
-        roicols = rng.uniform(1, mat1.cols);
-        roirows = rng.uniform(1, mat1.rows);
-        src1x   = rng.uniform(0, mat1.cols - roicols);
-        src1y   = rng.uniform(0, mat1.rows - roirows);
-        src2x   = rng.uniform(0, mat2.cols - roicols);
-        src2y   = rng.uniform(0, mat2.rows - roirows);
-        src3x   = rng.uniform(0, mat3.cols - roicols);
-        src3y   = rng.uniform(0, mat3.rows - roirows);
-        src4x   = rng.uniform(0, mat4.cols - roicols);
-        src4y   = rng.uniform(0, mat4.rows - roirows);
-        dstx    = rng.uniform(0, dst.cols  - roicols);
-        dsty    = rng.uniform(0, dst.rows  - roirows);
-#else
-        roicols = mat1.cols;
-        roirows = mat1.rows;
-        src1x   = 0;
-        src1y   = 0;
-        src2x   = 0;
-        src2y   = 0;
-        src3x   = 0;
-        src3y   = 0;
-        src4x   = 0;
-        src4y   = 0;
-        dstx    = 0;
-        dsty    = 0;
-#endif
+        if (use_roi)
+        {
+            //randomize ROI
+            cv::RNG &rng = TS::ptr()->get_rng();
+            roicols = rng.uniform(1, mat[0].cols);
+            roirows = rng.uniform(1, mat[0].rows);
 
+            for (int i = 0; i < channels; ++i)
+            {
+                srcx[i] = rng.uniform(0, mat[i].cols - roicols);
+                srcy[i] = rng.uniform(0, mat[i].rows - roirows);
+            }
 
-        mat1_roi = mat1(Rect(src1x, src1y, roicols, roirows));
-        mat2_roi = mat2(Rect(src2x, src2y, roicols, roirows));
-        mat3_roi = mat3(Rect(src3x, src3y, roicols, roirows));
-        mat4_roi = mat4(Rect(src4x, src4y, roicols, roirows));
+            dstx = rng.uniform(0, dst.cols  - roicols);
+            dsty = rng.uniform(0, dst.rows  - roirows);
+        }
+        else
+        {
+            roicols = mat[0].cols;
+            roirows = mat[0].rows;
+            for (int i = 0; i < channels; ++i)
+                srcx[i] = srcy[i] = 0;
 
+            dstx = dsty = 0;
+        }
+
+        for (int i = 0; i < channels; ++i)
+            mat_roi[i] = mat[i](Rect(srcx[i], srcy[i], roicols, roirows));
 
         dst_roi = dst(Rect(dstx, dsty, roicols, roirows));
 
         gdst_whole = dst;
         gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
 
-        gmat1 = mat1_roi;
-        gmat2 = mat2_roi;
-        gmat3 = mat3_roi;
-        gmat4 = mat4_roi;
+        for (int i = 0; i < channels; ++i)
+            gmat[i] = mat_roi[i];
     }
-
 };
 
 struct Merge : MergeTestBase {};
@@ -175,159 +147,97 @@ TEST_P(Merge, Accuracy)
     {
         random_roi();
 
-        std::vector<cv::Mat> dev_src;
-        dev_src.push_back(mat1_roi);
-
-        if(channels >= 2)
-            dev_src.push_back(mat2_roi);
-
-        if(channels >= 3)
-            dev_src.push_back(mat3_roi);
-
-        if(channels >= 4)
-            dev_src.push_back(mat4_roi);
-
-        std::vector<cv::ocl::oclMat> dev_gsrc;
-        dev_gsrc.push_back(gmat1);
-
-        if(channels >= 2)
-            dev_gsrc.push_back(gmat2);
-
-        if(channels >= 3)
-            dev_gsrc.push_back(gmat3);
-
-        if(channels >= 4)
-            dev_gsrc.push_back(gmat4);
-
-        cv::merge(dev_src, dst_roi);
-        cv::ocl::merge(dev_gsrc, gdst);
+        cv::merge(mat_roi, channels, dst_roi);
+        cv::ocl::merge(gmat, channels, gdst);
 
         EXPECT_MAT_NEAR(dst, Mat(gdst_whole), 0.0);
     }
 }
 
-
-
-PARAM_TEST_CASE(SplitTestBase, MatType, int)
+PARAM_TEST_CASE(SplitTestBase, MatType, int, bool)
 {
     int type;
     int channels;
+    bool use_roi;
 
     //src mat
     cv::Mat mat;
 
     //dstmat
-    cv::Mat dst1;
-    cv::Mat dst2;
-    cv::Mat dst3;
-    cv::Mat dst4;
+    cv::Mat dst[MAX_CHANNELS];
 
     // set up roi
-    int roicols;
-    int roirows;
-    int srcx;
-    int srcy;
-    int dst1x;
-    int dst1y;
-    int dst2x;
-    int dst2y;
-    int dst3x;
-    int dst3y;
-    int dst4x;
-    int dst4y;
+    int roicols, roirows;
+    int srcx, srcy;
+    int dstx[MAX_CHANNELS];
+    int dsty[MAX_CHANNELS];
 
     //src mat with roi
     cv::Mat mat_roi;
 
     //dst mat with roi
-    cv::Mat dst1_roi;
-    cv::Mat dst2_roi;
-    cv::Mat dst3_roi;
-    cv::Mat dst4_roi;
+    cv::Mat dst_roi[MAX_CHANNELS];
 
     //ocl dst mat for testing
-    cv::ocl::oclMat gdst1_whole;
-    cv::ocl::oclMat gdst2_whole;
-    cv::ocl::oclMat gdst3_whole;
-    cv::ocl::oclMat gdst4_whole;
+    cv::ocl::oclMat gdst_whole[MAX_CHANNELS];
 
     //ocl mat with roi
     cv::ocl::oclMat gmat;
-    cv::ocl::oclMat gdst1;
-    cv::ocl::oclMat gdst2;
-    cv::ocl::oclMat gdst3;
-    cv::ocl::oclMat gdst4;
+    cv::ocl::oclMat gdst[MAX_CHANNELS];
 
     virtual void SetUp()
     {
         type = GET_PARAM(0);
         channels = GET_PARAM(1);
+        use_roi = GET_PARAM(2);
 
         cv::RNG &rng = TS::ptr()->get_rng();
         cv::Size size(MWIDTH, MHEIGHT);
 
         mat  = randomMat(rng, size, CV_MAKETYPE(type, channels), 5, 16, false);
-        dst1 = randomMat(rng, size, CV_MAKETYPE(type, 1), 5, 16, false);
-        dst2 = randomMat(rng, size, CV_MAKETYPE(type, 1), 5, 16, false);
-        dst3 = randomMat(rng, size, CV_MAKETYPE(type, 1), 5, 16, false);
-        dst4 = randomMat(rng, size, CV_MAKETYPE(type, 1), 5, 16, false);
-
-    }
+        for (int i = 0; i < channels; ++i)
+            dst[i] = randomMat(rng, size, CV_MAKETYPE(type, 1), 5, 16, false);    }
 
     void random_roi()
     {
-#ifdef RANDOMROI
-        //randomize ROI
-        cv::RNG &rng = TS::ptr()->get_rng();
-        roicols = rng.uniform(1, mat.cols);
-        roirows = rng.uniform(1, mat.rows);
-        srcx    = rng.uniform(0, mat.cols - roicols);
-        srcy    = rng.uniform(0, mat.rows - roirows);
-        dst1x   = rng.uniform(0, dst1.cols  - roicols);
-        dst1y   = rng.uniform(0, dst1.rows  - roirows);
-        dst2x   = rng.uniform(0, dst2.cols  - roicols);
-        dst2y   = rng.uniform(0, dst2.rows  - roirows);
-        dst3x   = rng.uniform(0, dst3.cols  - roicols);
-        dst3y   = rng.uniform(0, dst3.rows  - roirows);
-        dst4x   = rng.uniform(0, dst4.cols  - roicols);
-        dst4y   = rng.uniform(0, dst4.rows  - roirows);
-#else
-        roicols = mat.cols;
-        roirows = mat.rows;
-        srcx    = 0;
-        srcy    = 0;
-        dst1x   = 0;
-        dst1y   = 0;
-        dst2x   = 0;
-        dst2y   = 0;
-        dst3x   = 0;
-        dst3y   = 0;
-        dst4x   = 0;
-        dst4y   = 0;
-#endif
+        if (use_roi)
+        {
+            //randomize ROI
+            cv::RNG &rng = TS::ptr()->get_rng();
+            roicols = rng.uniform(1, mat.cols);
+            roirows = rng.uniform(1, mat.rows);
+            srcx    = rng.uniform(0, mat.cols - roicols);
+            srcy    = rng.uniform(0, mat.rows - roirows);
+
+            for (int i = 0; i < channels; ++i)
+            {
+                dstx[i] = rng.uniform(0, dst[i].cols  - roicols);
+                dsty[i] = rng.uniform(0, dst[i].rows  - roirows);
+            }
+        }
+        else
+        {
+            roicols = mat.cols;
+            roirows = mat.rows;
+            srcx = srcy = 0;
+
+            for (int i = 0; i < channels; ++i)
+                dstx[i] = dsty[i] = 0;
+        }
 
         mat_roi = mat(Rect(srcx, srcy, roicols, roirows));
 
-        dst1_roi = dst1(Rect(dst1x, dst1y, roicols, roirows));
-        dst2_roi = dst2(Rect(dst2x, dst2y, roicols, roirows));
-        dst3_roi = dst3(Rect(dst3x, dst3y, roicols, roirows));
-        dst4_roi = dst4(Rect(dst4x, dst4y, roicols, roirows));
+        for (int i = 0; i < channels; ++i)
+            dst_roi[i] = dst[i](Rect(dstx[i], dsty[i], roicols, roirows));
 
-        gdst1_whole = dst1;
-        gdst1 = gdst1_whole(Rect(dst1x, dst1y, roicols, roirows));
-
-        gdst2_whole = dst2;
-        gdst2 = gdst2_whole(Rect(dst2x, dst2y, roicols, roirows));
-
-        gdst3_whole = dst3;
-        gdst3 = gdst3_whole(Rect(dst3x, dst3y, roicols, roirows));
-
-        gdst4_whole = dst4;
-        gdst4 = gdst4_whole(Rect(dst4x, dst4y, roicols, roirows));
+        for (int i = 0; i < channels; ++i)
+        {
+            gdst_whole[i] = dst[i];
+            gdst[i] = gdst_whole[i](Rect(dstx[i], dsty[i], roicols, roirows));
+        }
 
         gmat = mat_roi;
     }
-
 };
 
 struct Split : SplitTestBase {};
@@ -338,33 +248,21 @@ TEST_P(Split, Accuracy)
     {
         random_roi();
 
-        cv::Mat         dev_dst[4]  = {dst1_roi, dst2_roi, dst3_roi, dst4_roi};
-        cv::ocl::oclMat dev_gdst[4] = {gdst1, gdst2, gdst3, gdst4};
+        cv::split(mat_roi, dst_roi);
+        cv::ocl::split(gmat, gdst);
 
-        cv::split(mat_roi, dev_dst);
-        cv::ocl::split(gmat, dev_gdst);
-
-        if(channels >= 1)
-            EXPECT_MAT_NEAR(dst1, Mat(gdst1_whole), 0.0);
-
-        if(channels >= 2)
-            EXPECT_MAT_NEAR(dst2, Mat(gdst2_whole), 0.0);
-
-        if(channels >= 3)
-            EXPECT_MAT_NEAR(dst3, Mat(gdst3_whole), 0.0);
-
-        if(channels >= 4)
-            EXPECT_MAT_NEAR(dst4, Mat(gdst4_whole), 0.0);
+        for (int i = 0; i < channels; ++i)
+            EXPECT_MAT_NEAR(dst[i], Mat(gdst_whole[i]), 0.0);
     }
 }
 
 
 INSTANTIATE_TEST_CASE_P(SplitMerge, Merge, Combine(
-                            Values(CV_8U, CV_32S, CV_32F), Values(1, 3, 4)));
+                            Values(CV_8U, CV_8S, CV_16U, CV_16S, CV_32S, CV_32F), Range(1, 5), Bool()));
 
 
 INSTANTIATE_TEST_CASE_P(SplitMerge, Split , Combine(
-                            Values(CV_8U, CV_32S, CV_32F), Values(1, 3, 4)));
+                            Values(CV_8U, CV_8S, CV_16U, CV_16S, CV_32S, CV_32F), Range(1, 5), Bool()));
 
 
 #endif // HAVE_OPENCL
diff --git a/modules/ocl/test/utility.hpp b/modules/ocl/test/utility.hpp
index ade7620b0..3494c6f9f 100644
--- a/modules/ocl/test/utility.hpp
+++ b/modules/ocl/test/utility.hpp
@@ -43,9 +43,15 @@
 #define __OPENCV_TEST_UTILITY_HPP__
 #include "opencv2/core.hpp"
 
+
 #define LOOP_TIMES 1
+
 #define MWIDTH 256
 #define MHEIGHT 256
+
+#define MIN_VALUE 171
+#define MAX_VALUE 357
+
 //#define RANDOMROI
 int randomInt(int minVal, int maxVal);
 double randomDouble(double minVal, double maxVal);
@@ -75,6 +81,7 @@ double checkSimilarity(const cv::Mat &m1, const cv::Mat &m2);
 //oclMat create
 cv::ocl::oclMat createMat_ocl(cv::Size size, int type, bool useRoi = false);
 cv::ocl::oclMat loadMat_ocl(const cv::Mat& m, bool useRoi = false);
+
 #define EXPECT_MAT_NORM(mat, eps) \
 { \
     EXPECT_LE(checkNorm(cv::Mat(mat)), eps) \
@@ -86,14 +93,7 @@ cv::ocl::oclMat loadMat_ocl(const cv::Mat& m, bool useRoi = false);
    ASSERT_EQ(mat1.size(), mat2.size()); \
    EXPECT_LE(checkNorm(cv::Mat(mat1), cv::Mat(mat2)), eps); \
 }
-/*
-#define EXPECT_MAT_NEAR(mat1, mat2, eps,s) \
-{ \
-    ASSERT_EQ(mat1.type(), mat2.type()); \
-    ASSERT_EQ(mat1.size(), mat2.size()); \
-    EXPECT_LE(checkNorm(cv::Mat(mat1), cv::Mat(mat2)), eps)<<s; \
-}
-*/
+
 #define EXPECT_MAT_SIMILAR(mat1, mat2, eps) \
 { \
     ASSERT_EQ(mat1.type(), mat2.type()); \
diff --git a/platforms/scripts/ABI_compat_generator.py b/platforms/scripts/ABI_compat_generator.py
index d7cc3728a..c518589d9 100755
--- a/platforms/scripts/ABI_compat_generator.py
+++ b/platforms/scripts/ABI_compat_generator.py
@@ -47,7 +47,7 @@ def GetClasses(root, prefix):
 
 
 def GetJavaHHeaders():
-    print('\nGenerating JNI headers for Java API ...')
+    print('Generating JNI headers for Java API ...')
 
     javahHeaders = os.path.join(managerDir, 'javah_generated_headers')
     if os.path.exists(javahHeaders):
@@ -70,7 +70,7 @@ def GetJavaHHeaders():
         os.system('javah -d %s -classpath %s:%s %s' % (javahHeaders, classPath, \
             AndroidJavaDeps, currentClass))
 
-    print('\nBuilding JNI headers list ...')
+    print('Building JNI headers list ...')
     jniHeaders = GetHeaderFiles(javahHeaders)
 
     return jniHeaders
@@ -94,10 +94,10 @@ def GetOpenCVModules():
 
 
 
-def FindHeaders():
+def FindHeaders(includeJni):
     headers = []
 
-    print('\nBuilding Native OpenCV header list ...')
+    print('Building Native OpenCV header list ...')
 
     cppHeadersFolder = os.path.join(managerDir, 'sdk/native/jni/include/opencv2')
 
@@ -124,7 +124,8 @@ def FindHeaders():
         'sdk/native/jni/include/opencv'))
     headers += cHeaders
 
-    headers += GetJavaHHeaders()
+    if (includeJni):
+        headers += GetJavaHHeaders()
 
     return headers
 
@@ -198,24 +199,32 @@ def WriteXml(version, headers, includes, libraries):
 
 
 if __name__ == '__main__':
-    usage = '%prog <OpenCV_Manager install directory> <OpenCV_Manager version>'
+    usage = '%prog [options] <OpenCV_Manager install directory> <OpenCV_Manager version>'
     parser = OptionParser(usage = usage)
+    parser.add_option('--exclude-jni', dest='excludeJni', action="store_true", default=False, metavar="EXCLUDE_JNI", help='Exclude headers for all JNI functions')
+    parser.add_option('--sdk', dest='sdk', default='~/NVPACK/android-sdk-linux', metavar="PATH", help='Android SDK path')
+    parser.add_option('--ndk', dest='ndk', default='/opt/android-ndk-r8c', metavar="PATH", help='Android NDK path')
+    parser.add_option('--java-api-level', dest='java_api_level', default='14', metavar="JAVA_API_LEVEL", help='Java API level for generating JNI headers')
+
+    (options, args) = parser.parse_args()
 
-    args = parser.parse_args()
     if 2 != len(args):
         parser.print_help()
         quit()
 
-    managerDir = args[1][0]
-    version = args[1][1]
+    managerDir = args[0]
+    version = args[1]
 
-    NDK_path = '/opt/android-ndk-r8c'
-    print '\nUsing Android NDK from "%s"' % NDK_path
+    include_jni = not options.excludeJni
+    print 'Include Jni headers: %s' % (include_jni)
 
-    SDK_path = '~/NVPACK/android-sdk-linux'
-    print '\nUsing Android SDK from "%s"' % SDK_path
+    NDK_path = options.ndk
+    print 'Using Android NDK from "%s"' % NDK_path
 
-    headers = FindHeaders()
+    SDK_path = options.sdk
+    print 'Using Android SDK from "%s"' % SDK_path
+
+    headers = FindHeaders(include_jni)
 
     includes = FindIncludes()
 
diff --git a/samples/cpp/OpenEXRimages_HighDynamicRange_Retina_toneMapping.cpp b/samples/cpp/OpenEXRimages_HDR_Retina_toneMapping.cpp
similarity index 98%
rename from samples/cpp/OpenEXRimages_HighDynamicRange_Retina_toneMapping.cpp
rename to samples/cpp/OpenEXRimages_HDR_Retina_toneMapping.cpp
index 539115a99..e2d2a5e18 100644
--- a/samples/cpp/OpenEXRimages_HighDynamicRange_Retina_toneMapping.cpp
+++ b/samples/cpp/OpenEXRimages_HDR_Retina_toneMapping.cpp
@@ -17,10 +17,10 @@
 static void help(std::string errorMessage)
 {
     std::cout<<"Program init error : "<<errorMessage<<std::endl;
-    std::cout<<"\nProgram call procedure : ./OpenEXRimages_HighDynamicRange_Retina_toneMapping [OpenEXR image to process]"<<std::endl;
+    std::cout<<"\nProgram call procedure : ./OpenEXRimages_HDR_Retina_toneMapping [OpenEXR image to process]"<<std::endl;
     std::cout<<"\t[OpenEXR image to process] : the input HDR image to process, must be an OpenEXR format, see http://www.openexr.com/ to get some samples or create your own using camera bracketing and Photoshop or equivalent software for OpenEXR image synthesis"<<std::endl;
     std::cout<<"\nExamples:"<<std::endl;
-    std::cout<<"\t-Image processing : ./OpenEXRimages_HighDynamicRange_Retina_toneMapping memorial.exr"<<std::endl;
+    std::cout<<"\t-Image processing : ./OpenEXRimages_HDR_Retina_toneMapping memorial.exr"<<std::endl;
 }
 
 // simple procedure for 1D curve tracing
diff --git a/samples/cpp/OpenEXRimages_HighDynamicRange_Retina_toneMapping_video.cpp b/samples/cpp/OpenEXRimages_HDR_Retina_toneMapping_video.cpp
similarity index 97%
rename from samples/cpp/OpenEXRimages_HighDynamicRange_Retina_toneMapping_video.cpp
rename to samples/cpp/OpenEXRimages_HDR_Retina_toneMapping_video.cpp
index 646b1b7b7..9890feee9 100644
--- a/samples/cpp/OpenEXRimages_HighDynamicRange_Retina_toneMapping_video.cpp
+++ b/samples/cpp/OpenEXRimages_HDR_Retina_toneMapping_video.cpp
@@ -1,6 +1,6 @@
 
 //============================================================================
-// Name        : OpenEXRimages_HighDynamicRange_Retina_toneMapping_video.cpp
+// Name        : OpenEXRimages_HDR_Retina_toneMapping_video.cpp
 // Author      : Alexandre Benoit (benoit.alexandre.vision@gmail.com)
 // Version     : 0.2
 // Copyright   : Alexandre Benoit, LISTIC Lab, december 2011
@@ -21,14 +21,14 @@
 static void help(std::string errorMessage)
 {
     std::cout<<"Program init error : "<<errorMessage<<std::endl;
-    std::cout<<"\nProgram call procedure : ./OpenEXRimages_HighDynamicRange_Retina_toneMapping [OpenEXR image sequence to process] [OPTIONNAL start frame] [OPTIONNAL end frame]"<<std::endl;
+    std::cout<<"\nProgram call procedure : ./OpenEXRimages_HDR_Retina_toneMapping [OpenEXR image sequence to process] [OPTIONNAL start frame] [OPTIONNAL end frame]"<<std::endl;
     std::cout<<"\t[OpenEXR image sequence to process] : std::sprintf style ready prototype filename of the input HDR images to process, must be an OpenEXR format, see http://www.openexr.com/ to get some samples or create your own using camera bracketing and Photoshop or equivalent software for OpenEXR image synthesis"<<std::endl;
     std::cout<<"\t\t => WARNING : image index number of digits cannot exceed 10"<<std::endl;
     std::cout<<"\t[start frame] : the starting frame tat should be considered"<<std::endl;
     std::cout<<"\t[end frame] : the ending frame tat should be considered"<<std::endl;
     std::cout<<"\nExamples:"<<std::endl;
-    std::cout<<"\t-Image processing : ./OpenEXRimages_HighDynamicRange_Retina_toneMapping_video memorial%3d.exr 20 45"<<std::endl;
-    std::cout<<"\t-Image processing : ./OpenEXRimages_HighDynamicRange_Retina_toneMapping_video memorial%3d.exr 20 45 log"<<std::endl;
+    std::cout<<"\t-Image processing : ./OpenEXRimages_HDR_Retina_toneMapping_video memorial%3d.exr 20 45"<<std::endl;
+    std::cout<<"\t-Image processing : ./OpenEXRimages_HDR_Retina_toneMapping_video memorial%3d.exr 20 45 log"<<std::endl;
     std::cout<<"\t ==> to process images from memorial020d.exr to memorial045d.exr"<<std::endl;
 
 }
diff --git a/samples/cpp/latentsvm_multidetect.cpp b/samples/cpp/latentsvm_multidetect.cpp
index f5a8bc56d..d6ea4d5aa 100644
--- a/samples/cpp/latentsvm_multidetect.cpp
+++ b/samples/cpp/latentsvm_multidetect.cpp
@@ -88,6 +88,8 @@ static void readDirectory( const string& directoryName, vector<String>& filename
             else
                 filenames.push_back( string(dent->d_name) );
         }
+
+        closedir( dir );
     }
 #endif