diff --git a/3rdparty/jinja2/markupsafe/__init__.py b/3rdparty/jinja2/markupsafe/__init__.py
index f13d7070d..a602dd555 100644
--- a/3rdparty/jinja2/markupsafe/__init__.py
+++ b/3rdparty/jinja2/markupsafe/__init__.py
@@ -9,7 +9,7 @@
     :license: BSD, see LICENSE for more details.
 """
 import re
-from _compat import text_type, string_types, int_types, \
+from ._compat import text_type, string_types, int_types, \
      unichr, PY2
 
 
@@ -227,7 +227,7 @@ class _MarkupEscapeHelper(object):
 try:
     from _speedups import escape, escape_silent, soft_unicode
 except ImportError:
-    from _native import escape, escape_silent, soft_unicode
+    from ._native import escape, escape_silent, soft_unicode
 
 if not PY2:
     soft_str = soft_unicode
diff --git a/3rdparty/jinja2/markupsafe/_native.py b/3rdparty/jinja2/markupsafe/_native.py
index 4b4aee389..81d0777d1 100644
--- a/3rdparty/jinja2/markupsafe/_native.py
+++ b/3rdparty/jinja2/markupsafe/_native.py
@@ -8,7 +8,7 @@
     :copyright: (c) 2010 by Armin Ronacher.
     :license: BSD, see LICENSE for more details.
 """
-from _compat import text_type
+from ._compat import text_type
 
 
 def escape(s):
diff --git a/3rdparty/jinja2/utils.py b/3rdparty/jinja2/utils.py
index ddc47da0a..cbea660b4 100644
--- a/3rdparty/jinja2/utils.py
+++ b/3rdparty/jinja2/utils.py
@@ -517,4 +517,4 @@ class Joiner(object):
 
 
 # Imported here because that's where it was in the past
-from markupsafe import Markup, escape, soft_unicode
+from .markupsafe import Markup, escape, soft_unicode
diff --git a/3rdparty/lib/armeabi-v7a/libnative_camera_r2.2.0.so b/3rdparty/lib/armeabi-v7a/libnative_camera_r2.2.0.so
index 5b618a874..aac6634b4 100755
Binary files a/3rdparty/lib/armeabi-v7a/libnative_camera_r2.2.0.so and b/3rdparty/lib/armeabi-v7a/libnative_camera_r2.2.0.so differ
diff --git a/3rdparty/lib/armeabi-v7a/libnative_camera_r2.3.3.so b/3rdparty/lib/armeabi-v7a/libnative_camera_r2.3.3.so
index 846fc88bd..d523f69de 100755
Binary files a/3rdparty/lib/armeabi-v7a/libnative_camera_r2.3.3.so and b/3rdparty/lib/armeabi-v7a/libnative_camera_r2.3.3.so differ
diff --git a/3rdparty/lib/armeabi-v7a/libnative_camera_r3.0.1.so b/3rdparty/lib/armeabi-v7a/libnative_camera_r3.0.1.so
index 80bf459cc..e386bf4f9 100755
Binary files a/3rdparty/lib/armeabi-v7a/libnative_camera_r3.0.1.so and b/3rdparty/lib/armeabi-v7a/libnative_camera_r3.0.1.so differ
diff --git a/3rdparty/lib/armeabi-v7a/libnative_camera_r4.0.0.so b/3rdparty/lib/armeabi-v7a/libnative_camera_r4.0.0.so
index e5cc7d296..028ab7d1e 100755
Binary files a/3rdparty/lib/armeabi-v7a/libnative_camera_r4.0.0.so and b/3rdparty/lib/armeabi-v7a/libnative_camera_r4.0.0.so differ
diff --git a/3rdparty/lib/armeabi-v7a/libnative_camera_r4.0.3.so b/3rdparty/lib/armeabi-v7a/libnative_camera_r4.0.3.so
index d3cf3b124..48cbdd096 100755
Binary files a/3rdparty/lib/armeabi-v7a/libnative_camera_r4.0.3.so and b/3rdparty/lib/armeabi-v7a/libnative_camera_r4.0.3.so differ
diff --git a/3rdparty/lib/armeabi-v7a/libnative_camera_r4.1.1.so b/3rdparty/lib/armeabi-v7a/libnative_camera_r4.1.1.so
index 6498151ba..7fe50875c 100755
Binary files a/3rdparty/lib/armeabi-v7a/libnative_camera_r4.1.1.so and b/3rdparty/lib/armeabi-v7a/libnative_camera_r4.1.1.so differ
diff --git a/3rdparty/lib/armeabi-v7a/libnative_camera_r4.2.0.so b/3rdparty/lib/armeabi-v7a/libnative_camera_r4.2.0.so
index 58bef3455..15827d818 100755
Binary files a/3rdparty/lib/armeabi-v7a/libnative_camera_r4.2.0.so and b/3rdparty/lib/armeabi-v7a/libnative_camera_r4.2.0.so differ
diff --git a/3rdparty/lib/armeabi-v7a/libnative_camera_r4.3.0.so b/3rdparty/lib/armeabi-v7a/libnative_camera_r4.3.0.so
index ce69b52ea..ec1edfb04 100755
Binary files a/3rdparty/lib/armeabi-v7a/libnative_camera_r4.3.0.so and b/3rdparty/lib/armeabi-v7a/libnative_camera_r4.3.0.so differ
diff --git a/3rdparty/lib/armeabi-v7a/libnative_camera_r4.4.0.so b/3rdparty/lib/armeabi-v7a/libnative_camera_r4.4.0.so
index 3e65fb171..4d777edf8 100755
Binary files a/3rdparty/lib/armeabi-v7a/libnative_camera_r4.4.0.so and b/3rdparty/lib/armeabi-v7a/libnative_camera_r4.4.0.so differ
diff --git a/3rdparty/lib/armeabi/libnative_camera_r2.2.0.so b/3rdparty/lib/armeabi/libnative_camera_r2.2.0.so
index 68805b589..1707a8850 100755
Binary files a/3rdparty/lib/armeabi/libnative_camera_r2.2.0.so and b/3rdparty/lib/armeabi/libnative_camera_r2.2.0.so differ
diff --git a/3rdparty/lib/armeabi/libnative_camera_r2.3.3.so b/3rdparty/lib/armeabi/libnative_camera_r2.3.3.so
index 88ac3f7e3..fb4b125fd 100755
Binary files a/3rdparty/lib/armeabi/libnative_camera_r2.3.3.so and b/3rdparty/lib/armeabi/libnative_camera_r2.3.3.so differ
diff --git a/3rdparty/lib/armeabi/libnative_camera_r3.0.1.so b/3rdparty/lib/armeabi/libnative_camera_r3.0.1.so
index fa41cb250..96b264d0e 100755
Binary files a/3rdparty/lib/armeabi/libnative_camera_r3.0.1.so and b/3rdparty/lib/armeabi/libnative_camera_r3.0.1.so differ
diff --git a/3rdparty/lib/armeabi/libnative_camera_r4.0.0.so b/3rdparty/lib/armeabi/libnative_camera_r4.0.0.so
index a305c2b00..179eef9a9 100755
Binary files a/3rdparty/lib/armeabi/libnative_camera_r4.0.0.so and b/3rdparty/lib/armeabi/libnative_camera_r4.0.0.so differ
diff --git a/3rdparty/lib/armeabi/libnative_camera_r4.0.3.so b/3rdparty/lib/armeabi/libnative_camera_r4.0.3.so
index 8c34357cc..165dc463c 100755
Binary files a/3rdparty/lib/armeabi/libnative_camera_r4.0.3.so and b/3rdparty/lib/armeabi/libnative_camera_r4.0.3.so differ
diff --git a/3rdparty/lib/armeabi/libnative_camera_r4.1.1.so b/3rdparty/lib/armeabi/libnative_camera_r4.1.1.so
index a01ee15e2..a9a5d7da7 100755
Binary files a/3rdparty/lib/armeabi/libnative_camera_r4.1.1.so and b/3rdparty/lib/armeabi/libnative_camera_r4.1.1.so differ
diff --git a/3rdparty/lib/armeabi/libnative_camera_r4.2.0.so b/3rdparty/lib/armeabi/libnative_camera_r4.2.0.so
index a8ff89465..9037c6860 100755
Binary files a/3rdparty/lib/armeabi/libnative_camera_r4.2.0.so and b/3rdparty/lib/armeabi/libnative_camera_r4.2.0.so differ
diff --git a/3rdparty/lib/armeabi/libnative_camera_r4.3.0.so b/3rdparty/lib/armeabi/libnative_camera_r4.3.0.so
index aa1cfd844..026f0b48b 100755
Binary files a/3rdparty/lib/armeabi/libnative_camera_r4.3.0.so and b/3rdparty/lib/armeabi/libnative_camera_r4.3.0.so differ
diff --git a/3rdparty/lib/armeabi/libnative_camera_r4.4.0.so b/3rdparty/lib/armeabi/libnative_camera_r4.4.0.so
index 264f6f217..6aebec923 100755
Binary files a/3rdparty/lib/armeabi/libnative_camera_r4.4.0.so and b/3rdparty/lib/armeabi/libnative_camera_r4.4.0.so differ
diff --git a/3rdparty/lib/mips/libnative_camera_r4.0.3.so b/3rdparty/lib/mips/libnative_camera_r4.0.3.so
index 14dfaf23b..6dee89780 100755
Binary files a/3rdparty/lib/mips/libnative_camera_r4.0.3.so and b/3rdparty/lib/mips/libnative_camera_r4.0.3.so differ
diff --git a/3rdparty/lib/mips/libnative_camera_r4.1.1.so b/3rdparty/lib/mips/libnative_camera_r4.1.1.so
index a37474256..71a6354ac 100755
Binary files a/3rdparty/lib/mips/libnative_camera_r4.1.1.so and b/3rdparty/lib/mips/libnative_camera_r4.1.1.so differ
diff --git a/3rdparty/lib/mips/libnative_camera_r4.2.0.so b/3rdparty/lib/mips/libnative_camera_r4.2.0.so
index 31cbb3a99..21bcffb4a 100755
Binary files a/3rdparty/lib/mips/libnative_camera_r4.2.0.so and b/3rdparty/lib/mips/libnative_camera_r4.2.0.so differ
diff --git a/3rdparty/lib/mips/libnative_camera_r4.3.0.so b/3rdparty/lib/mips/libnative_camera_r4.3.0.so
index 379fc7003..653c2f1ca 100755
Binary files a/3rdparty/lib/mips/libnative_camera_r4.3.0.so and b/3rdparty/lib/mips/libnative_camera_r4.3.0.so differ
diff --git a/3rdparty/lib/mips/libnative_camera_r4.4.0.so b/3rdparty/lib/mips/libnative_camera_r4.4.0.so
index 0f6c83713..8d6fdf2bc 100755
Binary files a/3rdparty/lib/mips/libnative_camera_r4.4.0.so and b/3rdparty/lib/mips/libnative_camera_r4.4.0.so differ
diff --git a/3rdparty/lib/x86/libnative_camera_r2.3.3.so b/3rdparty/lib/x86/libnative_camera_r2.3.3.so
index 5c46b1607..a47b8b2ce 100755
Binary files a/3rdparty/lib/x86/libnative_camera_r2.3.3.so and b/3rdparty/lib/x86/libnative_camera_r2.3.3.so differ
diff --git a/3rdparty/lib/x86/libnative_camera_r3.0.1.so b/3rdparty/lib/x86/libnative_camera_r3.0.1.so
index 77512e5de..faa13461f 100755
Binary files a/3rdparty/lib/x86/libnative_camera_r3.0.1.so and b/3rdparty/lib/x86/libnative_camera_r3.0.1.so differ
diff --git a/3rdparty/lib/x86/libnative_camera_r4.0.3.so b/3rdparty/lib/x86/libnative_camera_r4.0.3.so
index b5de08299..2d2fb8eb1 100755
Binary files a/3rdparty/lib/x86/libnative_camera_r4.0.3.so and b/3rdparty/lib/x86/libnative_camera_r4.0.3.so differ
diff --git a/3rdparty/lib/x86/libnative_camera_r4.1.1.so b/3rdparty/lib/x86/libnative_camera_r4.1.1.so
index 867137410..f40da0d9d 100755
Binary files a/3rdparty/lib/x86/libnative_camera_r4.1.1.so and b/3rdparty/lib/x86/libnative_camera_r4.1.1.so differ
diff --git a/3rdparty/lib/x86/libnative_camera_r4.2.0.so b/3rdparty/lib/x86/libnative_camera_r4.2.0.so
index 52e9a5792..0d4ac03b5 100755
Binary files a/3rdparty/lib/x86/libnative_camera_r4.2.0.so and b/3rdparty/lib/x86/libnative_camera_r4.2.0.so differ
diff --git a/3rdparty/lib/x86/libnative_camera_r4.3.0.so b/3rdparty/lib/x86/libnative_camera_r4.3.0.so
index af898ccad..7e1c5803a 100755
Binary files a/3rdparty/lib/x86/libnative_camera_r4.3.0.so and b/3rdparty/lib/x86/libnative_camera_r4.3.0.so differ
diff --git a/3rdparty/lib/x86/libnative_camera_r4.4.0.so b/3rdparty/lib/x86/libnative_camera_r4.4.0.so
index 108862f56..37ab6d080 100755
Binary files a/3rdparty/lib/x86/libnative_camera_r4.4.0.so and b/3rdparty/lib/x86/libnative_camera_r4.4.0.so differ
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2d4704779..442edf32c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -438,7 +438,6 @@ include(cmake/OpenCVFindLibsGUI.cmake)
 include(cmake/OpenCVFindLibsVideo.cmake)
 include(cmake/OpenCVFindLibsPerf.cmake)
 
-
 # ----------------------------------------------------------------------------
 #  Detect other 3rd-party libraries/tools
 # ----------------------------------------------------------------------------
diff --git a/cmake/OpenCVDetectCUDA.cmake b/cmake/OpenCVDetectCUDA.cmake
index 89602acaa..2685171bb 100644
--- a/cmake/OpenCVDetectCUDA.cmake
+++ b/cmake/OpenCVDetectCUDA.cmake
@@ -217,3 +217,42 @@ else()
   unset(CUDA_ARCH_BIN CACHE)
   unset(CUDA_ARCH_PTX CACHE)
 endif()
+
+if(HAVE_CUDA)
+  set(CUDA_LIBS_PATH "")
+  foreach(p ${CUDA_LIBRARIES} ${CUDA_npp_LIBRARY})
+    get_filename_component(_tmp ${p} PATH)
+    list(APPEND CUDA_LIBS_PATH ${_tmp})
+  endforeach()
+
+  if(HAVE_CUBLAS)
+    foreach(p ${CUDA_cublas_LIBRARY})
+      get_filename_component(_tmp ${p} PATH)
+      list(APPEND CUDA_LIBS_PATH ${_tmp})
+    endforeach()
+  endif()
+
+  if(HAVE_CUFFT)
+    foreach(p ${CUDA_cufft_LIBRARY})
+      get_filename_component(_tmp ${p} PATH)
+      list(APPEND CUDA_LIBS_PATH ${_tmp})
+    endforeach()
+  endif()
+
+  list(REMOVE_DUPLICATES CUDA_LIBS_PATH)
+  link_directories(${CUDA_LIBS_PATH})
+
+  set(CUDA_LIBRARIES_ABS ${CUDA_LIBRARIES})
+  ocv_convert_to_lib_name(CUDA_LIBRARIES ${CUDA_LIBRARIES})
+  set(CUDA_npp_LIBRARY_ABS ${CUDA_npp_LIBRARY})
+  ocv_convert_to_lib_name(CUDA_npp_LIBRARY ${CUDA_npp_LIBRARY})
+  if(HAVE_CUBLAS)
+    set(CUDA_cublas_LIBRARY_ABS ${CUDA_cublas_LIBRARY})
+    ocv_convert_to_lib_name(CUDA_cublas_LIBRARY ${CUDA_cublas_LIBRARY})
+  endif()
+
+  if(HAVE_CUFFT)
+    set(CUDA_cufft_LIBRARY_ABS ${CUDA_cufft_LIBRARY})
+    ocv_convert_to_lib_name(CUDA_cufft_LIBRARY ${CUDA_cufft_LIBRARY})
+  endif()
+endif()
diff --git a/cmake/OpenCVModule.cmake b/cmake/OpenCVModule.cmake
index 372d4504c..e6fa19911 100644
--- a/cmake/OpenCVModule.cmake
+++ b/cmake/OpenCVModule.cmake
@@ -27,7 +27,8 @@
 # The verbose template for OpenCV module:
 #
 #   ocv_add_module(modname <dependencies>)
-#   ocv_glob_module_sources() or glob them manually and ocv_set_module_sources(...)
+#   ocv_glob_module_sources([EXCLUDE_CUDA] <extra sources&headers>)
+#                          or glob them manually and ocv_set_module_sources(...)
 #   ocv_module_include_directories(<extra include directories>)
 #   ocv_create_module()
 #   <add extra link dependencies, compiler options, etc>
@@ -478,9 +479,15 @@ endmacro()
 
 # finds and sets headers and sources for the standard OpenCV module
 # Usage:
-# ocv_glob_module_sources(<extra sources&headers in the same format as used in ocv_set_module_sources>)
+# ocv_glob_module_sources([EXCLUDE_CUDA] <extra sources&headers in the same format as used in ocv_set_module_sources>)
 macro(ocv_glob_module_sources)
-  file(GLOB_RECURSE lib_srcs     "src/*.cpp")
+  set(_argn ${ARGN})
+  list(FIND _argn "EXCLUDE_CUDA" exclude_cuda)
+  if(NOT exclude_cuda EQUAL -1)
+    list(REMOVE_AT _argn ${exclude_cuda})
+  endif()
+
+  file(GLOB_RECURSE lib_srcs "src/*.cpp")
   file(GLOB_RECURSE lib_int_hdrs "src/*.hpp" "src/*.h")
   file(GLOB lib_hdrs     "include/opencv2/*.hpp" "include/opencv2/${name}/*.hpp" "include/opencv2/${name}/*.h")
   file(GLOB lib_hdrs_detail "include/opencv2/${name}/detail/*.hpp" "include/opencv2/${name}/detail/*.h")
@@ -492,15 +499,21 @@ macro(ocv_glob_module_sources)
   ocv_source_group("Src" DIRBASE "${CMAKE_CURRENT_SOURCE_DIR}/src" FILES ${lib_srcs} ${lib_int_hdrs})
   ocv_source_group("Include" DIRBASE "${CMAKE_CURRENT_SOURCE_DIR}/include" FILES ${lib_hdrs} ${lib_hdrs_detail})
 
-  file(GLOB lib_cuda_srcs "src/cuda/*.cu")
-  set(cuda_objs "")
-  set(lib_cuda_hdrs "")
-  if(HAVE_CUDA AND lib_cuda_srcs)
-    ocv_include_directories(${CUDA_INCLUDE_DIRS})
-    file(GLOB lib_cuda_hdrs "src/cuda/*.hpp")
+  if (exclude_cuda EQUAL -1)
+    file(GLOB lib_cuda_srcs "src/cuda/*.cu")
+    set(cuda_objs "")
+    set(lib_cuda_hdrs "")
+    if(HAVE_CUDA)
+      ocv_include_directories(${CUDA_INCLUDE_DIRS})
+      file(GLOB lib_cuda_hdrs "src/cuda/*.hpp")
 
-    ocv_cuda_compile(cuda_objs ${lib_cuda_srcs} ${lib_cuda_hdrs})
-    source_group("Src\\Cuda" FILES ${lib_cuda_srcs} ${lib_cuda_hdrs})
+      ocv_cuda_compile(cuda_objs ${lib_cuda_srcs} ${lib_cuda_hdrs})
+      source_group("Src\\Cuda"      FILES ${lib_cuda_srcs} ${lib_cuda_hdrs})
+    endif()
+  else()
+    set(cuda_objs "")
+    set(lib_cuda_srcs "")
+    set(lib_cuda_hdrs "")
   endif()
 
   file(GLOB cl_kernels "src/opencl/*.cl")
@@ -516,8 +529,8 @@ macro(ocv_glob_module_sources)
     list(APPEND lib_srcs ${cl_kernels} "${CMAKE_CURRENT_BINARY_DIR}/opencl_kernels.cpp" "${CMAKE_CURRENT_BINARY_DIR}/opencl_kernels.hpp")
   endif()
 
-  ocv_set_module_sources(${ARGN} HEADERS ${lib_hdrs} ${lib_hdrs_detail}
-                                 SOURCES ${lib_srcs} ${lib_int_hdrs} ${cuda_objs} ${lib_cuda_srcs} ${lib_cuda_hdrs})
+  ocv_set_module_sources(${_argn} HEADERS ${lib_hdrs} ${lib_hdrs_detail}
+                         SOURCES ${lib_srcs} ${lib_int_hdrs} ${cuda_objs} ${lib_cuda_srcs} ${lib_cuda_hdrs})
 endmacro()
 
 # creates OpenCV module in current folder
@@ -622,11 +635,20 @@ endmacro()
 # short command for adding simple OpenCV module
 # see ocv_add_module for argument details
 # Usage:
-# ocv_define_module(module_name  [INTERNAL] [REQUIRED] [<list of dependencies>] [OPTIONAL <list of optional dependencies>])
+# ocv_define_module(module_name  [INTERNAL] [EXCLUDE_CUDA] [REQUIRED] [<list of dependencies>] [OPTIONAL <list of optional dependencies>])
 macro(ocv_define_module module_name)
-  ocv_add_module(${module_name} ${ARGN})
+  set(_argn ${ARGN})
+  set(exclude_cuda "")
+  foreach(arg ${_argn})
+    if("${arg}" STREQUAL "EXCLUDE_CUDA")
+      set(exclude_cuda "${arg}")
+      list(REMOVE_ITEM _argn ${arg})
+    endif()
+  endforeach()
+
+  ocv_add_module(${module_name} ${_argn})
   ocv_module_include_directories()
-  ocv_glob_module_sources()
+  ocv_glob_module_sources(${exclude_cuda})
   ocv_create_module()
   ocv_add_precompiled_headers(${the_module})
 
diff --git a/cmake/templates/OpenCVConfig.cmake.in b/cmake/templates/OpenCVConfig.cmake.in
index 88eed8ee0..7dee995f5 100644
--- a/cmake/templates/OpenCVConfig.cmake.in
+++ b/cmake/templates/OpenCVConfig.cmake.in
@@ -19,8 +19,8 @@
 #    This file will define the following variables:
 #      - OpenCV_LIBS                     : The list of all imported targets for OpenCV modules.
 #      - OpenCV_INCLUDE_DIRS             : The OpenCV include directories.
-#      - OpenCV_COMPUTE_CAPABILITIES     : The version of compute capability
-#      - OpenCV_ANDROID_NATIVE_API_LEVEL : Minimum required level of Android API
+#      - OpenCV_COMPUTE_CAPABILITIES     : The version of compute capability.
+#      - OpenCV_ANDROID_NATIVE_API_LEVEL : Minimum required level of Android API.
 #      - OpenCV_VERSION                  : The version of this OpenCV build: "@OPENCV_VERSION_PLAIN@"
 #      - OpenCV_VERSION_MAJOR            : Major version part of OpenCV_VERSION: "@OPENCV_VERSION_MAJOR@"
 #      - OpenCV_VERSION_MINOR            : Minor version part of OpenCV_VERSION: "@OPENCV_VERSION_MINOR@"
@@ -28,25 +28,29 @@
 #      - OpenCV_VERSION_STATUS           : Development status of this build: "@OPENCV_VERSION_STATUS@"
 #
 #    Advanced variables:
-#      - OpenCV_SHARED
-#      - OpenCV_CONFIG_PATH
-#      - OpenCV_INSTALL_PATH  (not set on Windows)
-#      - OpenCV_LIB_COMPONENTS
-#      - OpenCV_USE_MANGLED_PATHS
-#      - OpenCV_HAVE_ANDROID_CAMERA
+#      - OpenCV_SHARED                   : Use OpenCV as shared library
+#      - OpenCV_CONFIG_PATH              : Path to this OpenCVConfig.cmake
+#      - OpenCV_INSTALL_PATH             : OpenCV location (not set on Windows)
+#      - OpenCV_LIB_COMPONENTS           : Present OpenCV modules list
+#      - OpenCV_USE_MANGLED_PATHS        : Mangled OpenCV path flag
+#      - OpenCV_MODULES_SUFFIX           : The suffix for OpenCVModules-XXX.cmake file
+#      - OpenCV_HAVE_ANDROID_CAMERA      : Presence of Android native camera wrappers
 #
 #    Deprecated variables:
 #      - OpenCV_VERSION_TWEAK            : Always "0"
 #
 # ===================================================================================
 
-set(modules_file_suffix "")
-if(ANDROID)
-  string(REPLACE - _ modules_file_suffix "_${ANDROID_NDK_ABI_NAME}")
+if(NOT DEFINED OpenCV_MODULES_SUFFIX)
+  if(ANDROID)
+    string(REPLACE - _ OpenCV_MODULES_SUFFIX "_${ANDROID_NDK_ABI_NAME}")
+  else()
+    set(OpenCV_MODULES_SUFFIX "")
+  endif()
 endif()
 
 if(NOT TARGET opencv_core)
-  include(${CMAKE_CURRENT_LIST_DIR}/OpenCVModules${modules_file_suffix}.cmake)
+  include(${CMAKE_CURRENT_LIST_DIR}/OpenCVModules${OpenCV_MODULES_SUFFIX}.cmake)
 endif()
 
 # TODO All things below should be reviewed. What is about of moving this code into related modules (special vars/hooks/files)
@@ -209,7 +213,7 @@ foreach(__opttype OPT DBG)
   SET(OpenCV_EXTRA_LIBS_${__opttype} "")
 
   # CUDA
-  if(OpenCV_CUDA_VERSION AND (CMAKE_CROSSCOMPILING OR (WIN32 AND NOT OpenCV_SHARED)))
+  if(OpenCV_CUDA_VERSION)
     if(NOT CUDA_FOUND)
       find_package(CUDA ${OpenCV_CUDA_VERSION} EXACT REQUIRED)
     else()
@@ -218,32 +222,41 @@ foreach(__opttype OPT DBG)
       endif()
     endif()
 
-    list(APPEND OpenCV_EXTRA_LIBS_${__opttype} ${CUDA_LIBRARIES})
+    set(OpenCV_CUDA_LIBS_ABSPATH ${CUDA_LIBRARIES})
 
     if(${CUDA_VERSION} VERSION_LESS "5.5")
-      list(APPEND OpenCV_EXTRA_LIBS_${__opttype} ${CUDA_npp_LIBRARY})
+      list(APPEND OpenCV_CUDA_LIBS_ABSPATH ${CUDA_npp_LIBRARY})
     else()
       find_cuda_helper_libs(nppc)
       find_cuda_helper_libs(nppi)
       find_cuda_helper_libs(npps)
-      list(APPEND OpenCV_EXTRA_LIBS_${__opttype} ${CUDA_nppc_LIBRARY} ${CUDA_nppi_LIBRARY} ${CUDA_npps_LIBRARY})
+      list(APPEND OpenCV_CUDA_LIBS_ABSPATH ${CUDA_nppc_LIBRARY} ${CUDA_nppi_LIBRARY} ${CUDA_npps_LIBRARY})
     endif()
 
     if(OpenCV_USE_CUBLAS)
-      list(APPEND OpenCV_EXTRA_LIBS_${__opttype} ${CUDA_CUBLAS_LIBRARIES})
+      list(APPEND OpenCV_CUDA_LIBS_ABSPATH ${CUDA_CUBLAS_LIBRARIES})
     endif()
 
     if(OpenCV_USE_CUFFT)
-      list(APPEND OpenCV_EXTRA_LIBS_${__opttype} ${CUDA_CUFFT_LIBRARIES})
+      list(APPEND OpenCV_CUDA_LIBS_ABSPATH ${CUDA_CUFFT_LIBRARIES})
     endif()
 
     if(OpenCV_USE_NVCUVID)
-      list(APPEND OpenCV_EXTRA_LIBS_${__opttype} ${CUDA_nvcuvid_LIBRARIES})
+      list(APPEND OpenCV_CUDA_LIBS_ABSPATH ${CUDA_nvcuvid_LIBRARIES})
     endif()
 
     if(WIN32)
-      list(APPEND OpenCV_EXTRA_LIBS_${__opttype} ${CUDA_nvcuvenc_LIBRARIES})
+      list(APPEND OpenCV_CUDA_LIBS_ABSPATH ${CUDA_nvcuvenc_LIBRARIES})
     endif()
+
+    set(OpenCV_CUDA_LIBS_RELPATH "")
+    foreach(l ${OpenCV_CUDA_LIBS_ABSPATH})
+      get_filename_component(_tmp ${l} PATH)
+      list(APPEND OpenCV_CUDA_LIBS_RELPATH ${_tmp})
+    endforeach()
+
+    list(REMOVE_DUPLICATES OpenCV_CUDA_LIBS_RELPATH)
+    link_directories(${OpenCV_CUDA_LIBS_RELPATH})
   endif()
 endforeach()
 
diff --git a/doc/conf.py b/doc/conf.py
index 0112725b9..c1e85cb82 100755
--- a/doc/conf.py
+++ b/doc/conf.py
@@ -304,11 +304,11 @@ extlinks = {
             'oldbasicstructures' : ('http://docs.opencv.org/modules/core/doc/old_basic_structures.html#%s', None),
             'readwriteimagevideo' : ('http://docs.opencv.org/modules/highgui/doc/reading_and_writing_images_and_video.html#%s', None),
             'operationsonarrays' : ('http://docs.opencv.org/modules/core/doc/operations_on_arrays.html#%s', None),
-            'utilitysystemfunctions':('http://docs.opencv.org/modules/core/doc/utility_and_system_functions_and_macros.html#%s', None),
-            'imgprocfilter':('http://docs.opencv.org/modules/imgproc/doc/filtering.html#%s', None),
-            'svms':('http://docs.opencv.org/modules/ml/doc/support_vector_machines.html#%s', None),
-            'drawingfunc':('http://docs.opencv.org/modules/core/doc/drawing_functions.html#%s', None),
-            'xmlymlpers':('http://docs.opencv.org/modules/core/doc/xml_yaml_persistence.html#%s', None),
+            'utilitysystemfunctions' : ('http://docs.opencv.org/modules/core/doc/utility_and_system_functions_and_macros.html#%s', None),
+            'imgprocfilter' : ('http://docs.opencv.org/modules/imgproc/doc/filtering.html#%s', None),
+            'svms' : ('http://docs.opencv.org/modules/ml/doc/support_vector_machines.html#%s', None),
+            'drawingfunc' : ('http://docs.opencv.org/modules/core/doc/drawing_functions.html#%s', None),
+            'xmlymlpers' : ('http://docs.opencv.org/modules/core/doc/xml_yaml_persistence.html#%s', None),
             'hgvideo' : ('http://docs.opencv.org/modules/highgui/doc/reading_and_writing_images_and_video.html#%s', None),
             'gpuinit' : ('http://docs.opencv.org/modules/gpu/doc/initalization_and_information.html#%s', None),
             'gpudatastructure' : ('http://docs.opencv.org/modules/gpu/doc/data_structures.html#%s', None),
@@ -316,56 +316,58 @@ extlinks = {
             'gpuperelement' : ('http://docs.opencv.org/modules/gpu/doc/per_element_operations.html#%s', None),
             'gpuimgproc' : ('http://docs.opencv.org/modules/gpu/doc/image_processing.html#%s', None),
             'gpumatrixreduct' : ('http://docs.opencv.org/modules/gpu/doc/matrix_reductions.html#%s', None),
-            'filtering':('http://docs.opencv.org/modules/imgproc/doc/filtering.html#%s', None),
+            'filtering' : ('http://docs.opencv.org/modules/imgproc/doc/filtering.html#%s', None),
             'flann' : ('http://docs.opencv.org/modules/flann/doc/flann_fast_approximate_nearest_neighbor_search.html#%s', None ),
             'calib3d' : ('http://docs.opencv.org/modules/calib3d/doc/camera_calibration_and_3d_reconstruction.html#%s', None ),
             'feature2d' : ('http://docs.opencv.org/modules/imgproc/doc/feature_detection.html#%s', None ),
             'imgproc_geometric' : ('http://docs.opencv.org/modules/imgproc/doc/geometric_transformations.html#%s', None ),
+            'miscellaneous_transformations' : ('http://docs.opencv.org/modules/imgproc/doc/miscellaneous_transformations.html#%s', None),
+            'user_interface' : ('http://docs.opencv.org/modules/highgui/doc/user_interface.html#%s', None),
 
             # 'opencv_group' : ('http://answers.opencv.org/%s', None),
             'opencv_qa' : ('http://answers.opencv.org/%s', None),
             'how_to_contribute' : ('http://code.opencv.org/projects/opencv/wiki/How_to_contribute/%s', None),
 
-            'cvt_color': ('http://docs.opencv.org/modules/imgproc/doc/miscellaneous_transformations.html?highlight=cvtcolor#cvtcolor%s', None),
-            'imread':    ('http://docs.opencv.org/modules/highgui/doc/reading_and_writing_images_and_video.html?highlight=imread#imread%s', None),
-            'imwrite':   ('http://docs.opencv.org/modules/highgui/doc/reading_and_writing_images_and_video.html?highlight=imwrite#imwrite%s', None),
-            'imshow':    ('http://docs.opencv.org/modules/highgui/doc/user_interface.html?highlight=imshow#imshow%s', None),
-            'named_window': ('http://docs.opencv.org/modules/highgui/doc/user_interface.html?highlight=namedwindow#namedwindow%s', None),
-            'wait_key': ('http://docs.opencv.org/modules/highgui/doc/user_interface.html?highlight=waitkey#waitkey%s', None),
-            'add_weighted': ('http://docs.opencv.org/modules/core/doc/operations_on_arrays.html?highlight=addweighted#addweighted%s', None),
-            'saturate_cast': ('http://docs.opencv.org/modules/core/doc/utility_and_system_functions_and_macros.html?highlight=saturate_cast#saturate-cast%s', None),
-            'mat_zeros': ('http://docs.opencv.org/modules/core/doc/basic_structures.html?highlight=zeros#mat-zeros%s', None),
-            'convert_to': ('http://docs.opencv.org/modules/core/doc/basic_structures.html#mat-convertto%s', None),
-            'create_trackbar': ('http://docs.opencv.org/modules/highgui/doc/user_interface.html?highlight=createtrackbar#createtrackbar%s', None),
-            'point': ('http://docs.opencv.org/modules/core/doc/basic_structures.html#point%s', None),
-            'scalar': ('http://docs.opencv.org/modules/core/doc/basic_structures.html#scalar%s', None),
-            'line': ('http://docs.opencv.org/modules/core/doc/drawing_functions.html#line%s', None),
-            'ellipse': ('http://docs.opencv.org/modules/core/doc/drawing_functions.html#ellipse%s', None),
-            'rectangle': ('http://docs.opencv.org/modules/core/doc/drawing_functions.html#rectangle%s', None),
-            'circle': ('http://docs.opencv.org/modules/core/doc/drawing_functions.html#circle%s', None),
-            'fill_poly': ('http://docs.opencv.org/modules/core/doc/drawing_functions.html#fillpoly%s', None),
-            'rng': ('http://docs.opencv.org/modules/core/doc/operations_on_arrays.html?highlight=rng#rng%s', None),
-            'put_text': ('http://docs.opencv.org/modules/core/doc/drawing_functions.html#puttext%s', None),
-            'gaussian_blur': ('http://docs.opencv.org/modules/imgproc/doc/filtering.html?highlight=gaussianblur#gaussianblur%s', None),
-            'blur': ('http://docs.opencv.org/modules/imgproc/doc/filtering.html?highlight=blur#blur%s', None),
-            'median_blur': ('http://docs.opencv.org/modules/imgproc/doc/filtering.html?highlight=medianblur#medianblur%s', None),
-            'bilateral_filter': ('http://docs.opencv.org/modules/imgproc/doc/filtering.html?highlight=bilateralfilter#bilateralfilter%s', None),
-            'erode': ('http://docs.opencv.org/modules/imgproc/doc/filtering.html?highlight=erode#erode%s', None),
-            'dilate': ('http://docs.opencv.org/modules/imgproc/doc/filtering.html?highlight=dilate#dilate%s', None),
-            'get_structuring_element': ('http://docs.opencv.org/modules/imgproc/doc/filtering.html?highlight=getstructuringelement#getstructuringelement%s', None),
-            'flood_fill': ( 'http://docs.opencv.org/modules/imgproc/doc/miscellaneous_transformations.html?highlight=floodfill#floodfill%s', None),
-            'morphology_ex': ('http://docs.opencv.org/modules/imgproc/doc/filtering.html?highlight=morphologyex#morphologyex%s', None),
-            'pyr_down': ('http://docs.opencv.org/modules/imgproc/doc/filtering.html?highlight=pyrdown#pyrdown%s', None),
-            'pyr_up': ('http://docs.opencv.org/modules/imgproc/doc/filtering.html?highlight=pyrup#pyrup%s', None),
-            'resize': ('http://docs.opencv.org/modules/imgproc/doc/geometric_transformations.html?highlight=resize#resize%s', None),
-            'threshold': ('http://docs.opencv.org/modules/imgproc/doc/miscellaneous_transformations.html?highlight=threshold#threshold%s', None),
-            'filter2d': ('http://docs.opencv.org/modules/imgproc/doc/filtering.html?highlight=filter2d#filter2d%s', None),
-            'copy_make_border': ('http://docs.opencv.org/modules/imgproc/doc/filtering.html?highlight=copymakeborder#copymakeborder%s', None),
-            'sobel': ('http://docs.opencv.org/modules/imgproc/doc/filtering.html?highlight=sobel#sobel%s', None),
-            'scharr': ('http://docs.opencv.org/modules/imgproc/doc/filtering.html?highlight=scharr#scharr%s', None),
-            'laplacian': ('http://docs.opencv.org/modules/imgproc/doc/filtering.html?highlight=laplacian#laplacian%s', None),
-            'canny': ('http://docs.opencv.org/modules/imgproc/doc/feature_detection.html?highlight=canny#canny%s', None),
-            'copy_to': ('http://docs.opencv.org/modules/core/doc/basic_structures.html?highlight=copyto#mat-copyto%s', None),
+            'cvt_color' : ('http://docs.opencv.org/modules/imgproc/doc/miscellaneous_transformations.html?highlight=cvtcolor#cvtcolor%s', None),
+            'imread' : ('http://docs.opencv.org/modules/highgui/doc/reading_and_writing_images_and_video.html?highlight=imread#imread%s', None),
+            'imwrite' : ('http://docs.opencv.org/modules/highgui/doc/reading_and_writing_images_and_video.html?highlight=imwrite#imwrite%s', None),
+            'imshow' : ('http://docs.opencv.org/modules/highgui/doc/user_interface.html?highlight=imshow#imshow%s', None),
+            'named_window' : ('http://docs.opencv.org/modules/highgui/doc/user_interface.html?highlight=namedwindow#namedwindow%s', None),
+            'wait_key' : ('http://docs.opencv.org/modules/highgui/doc/user_interface.html?highlight=waitkey#waitkey%s', None),
+            'add_weighted' : ('http://docs.opencv.org/modules/core/doc/operations_on_arrays.html?highlight=addweighted#addweighted%s', None),
+            'saturate_cast' : ('http://docs.opencv.org/modules/core/doc/utility_and_system_functions_and_macros.html?highlight=saturate_cast#saturate-cast%s', None),
+            'mat_zeros' : ('http://docs.opencv.org/modules/core/doc/basic_structures.html?highlight=zeros#mat-zeros%s', None),
+            'convert_to' : ('http://docs.opencv.org/modules/core/doc/basic_structures.html#mat-convertto%s', None),
+            'create_trackbar' : ('http://docs.opencv.org/modules/highgui/doc/user_interface.html?highlight=createtrackbar#createtrackbar%s', None),
+            'point' : ('http://docs.opencv.org/modules/core/doc/basic_structures.html#point%s', None),
+            'scalar' : ('http://docs.opencv.org/modules/core/doc/basic_structures.html#scalar%s', None),
+            'line' : ('http://docs.opencv.org/modules/core/doc/drawing_functions.html#line%s', None),
+            'ellipse' : ('http://docs.opencv.org/modules/core/doc/drawing_functions.html#ellipse%s', None),
+            'rectangle' : ('http://docs.opencv.org/modules/core/doc/drawing_functions.html#rectangle%s', None),
+            'circle' : ('http://docs.opencv.org/modules/core/doc/drawing_functions.html#circle%s', None),
+            'fill_poly' : ('http://docs.opencv.org/modules/core/doc/drawing_functions.html#fillpoly%s', None),
+            'rng' : ('http://docs.opencv.org/modules/core/doc/operations_on_arrays.html?highlight=rng#rng%s', None),
+            'put_text' : ('http://docs.opencv.org/modules/core/doc/drawing_functions.html#puttext%s', None),
+            'gaussian_blur' : ('http://docs.opencv.org/modules/imgproc/doc/filtering.html?highlight=gaussianblur#gaussianblur%s', None),
+            'blur' : ('http://docs.opencv.org/modules/imgproc/doc/filtering.html?highlight=blur#blur%s', None),
+            'median_blur' : ('http://docs.opencv.org/modules/imgproc/doc/filtering.html?highlight=medianblur#medianblur%s', None),
+            'bilateral_filter' : ('http://docs.opencv.org/modules/imgproc/doc/filtering.html?highlight=bilateralfilter#bilateralfilter%s', None),
+            'erode' : ('http://docs.opencv.org/modules/imgproc/doc/filtering.html?highlight=erode#erode%s', None),
+            'dilate' : ('http://docs.opencv.org/modules/imgproc/doc/filtering.html?highlight=dilate#dilate%s', None),
+            'get_structuring_element' : ('http://docs.opencv.org/modules/imgproc/doc/filtering.html?highlight=getstructuringelement#getstructuringelement%s', None),
+            'flood_fill' : ( 'http://docs.opencv.org/modules/imgproc/doc/miscellaneous_transformations.html?highlight=floodfill#floodfill%s', None),
+            'morphology_ex' : ('http://docs.opencv.org/modules/imgproc/doc/filtering.html?highlight=morphologyex#morphologyex%s', None),
+            'pyr_down' : ('http://docs.opencv.org/modules/imgproc/doc/filtering.html?highlight=pyrdown#pyrdown%s', None),
+            'pyr_up' : ('http://docs.opencv.org/modules/imgproc/doc/filtering.html?highlight=pyrup#pyrup%s', None),
+            'resize' : ('http://docs.opencv.org/modules/imgproc/doc/geometric_transformations.html?highlight=resize#resize%s', None),
+            'threshold' : ('http://docs.opencv.org/modules/imgproc/doc/miscellaneous_transformations.html?highlight=threshold#threshold%s', None),
+            'filter2d' : ('http://docs.opencv.org/modules/imgproc/doc/filtering.html?highlight=filter2d#filter2d%s', None),
+            'copy_make_border' : ('http://docs.opencv.org/modules/imgproc/doc/filtering.html?highlight=copymakeborder#copymakeborder%s', None),
+            'sobel' : ('http://docs.opencv.org/modules/imgproc/doc/filtering.html?highlight=sobel#sobel%s', None),
+            'scharr' : ('http://docs.opencv.org/modules/imgproc/doc/filtering.html?highlight=scharr#scharr%s', None),
+            'laplacian' : ('http://docs.opencv.org/modules/imgproc/doc/filtering.html?highlight=laplacian#laplacian%s', None),
+            'canny' : ('http://docs.opencv.org/modules/imgproc/doc/feature_detection.html?highlight=canny#canny%s', None),
+            'copy_to' : ('http://docs.opencv.org/modules/core/doc/basic_structures.html?highlight=copyto#mat-copyto%s', None),
             'hough_lines' : ('http://docs.opencv.org/modules/imgproc/doc/feature_detection.html?highlight=houghlines#houghlines%s', None),
             'hough_lines_p' : ('http://docs.opencv.org/modules/imgproc/doc/feature_detection.html?highlight=houghlinesp#houghlinesp%s', None),
             'hough_circles' : ('http://docs.opencv.org/modules/imgproc/doc/feature_detection.html?highlight=houghcircles#houghcircles%s', None),
diff --git a/doc/py_tutorials/py_imgproc/py_contours/py_contour_features/py_contour_features.rst b/doc/py_tutorials/py_imgproc/py_contours/py_contour_features/py_contour_features.rst
index 6b7c661cc..8220fb501 100644
--- a/doc/py_tutorials/py_imgproc/py_contours/py_contour_features/py_contour_features.rst
+++ b/doc/py_tutorials/py_imgproc/py_contours/py_contour_features/py_contour_features.rst
@@ -123,7 +123,7 @@ Let (x,y) be the top-left coordinate of the rectangle and (w,h) be its width and
 
 7.b. Rotated Rectangle
 -----------------------
-Here, bounding rectangle is drawn with minimum area, so it considers the rotation also. The function used is **cv2.minAreaRect()**. It returns a Box2D structure which contains following detals - ( top-left corner(x,y), (width, height), angle of rotation ). But to draw this rectangle, we need 4 corners of the rectangle. It is obtained by the function **cv2.boxPoints()**
+Here, bounding rectangle is drawn with minimum area, so it considers the rotation also. The function used is **cv2.minAreaRect()**. It returns a Box2D structure which contains the following details - ( center (x,y), (width, height), angle of rotation ). But to draw this rectangle, we need 4 corners of the rectangle. It is obtained by the function **cv2.boxPoints()**
 ::
 
     rect = cv2.minAreaRect(cnt)
diff --git a/doc/py_tutorials/py_setup/py_intro/py_intro.rst b/doc/py_tutorials/py_setup/py_intro/py_intro.rst
index 10622c640..65b30c2e5 100644
--- a/doc/py_tutorials/py_setup/py_intro/py_intro.rst
+++ b/doc/py_tutorials/py_setup/py_intro/py_intro.rst
@@ -7,45 +7,41 @@ Introduction to OpenCV-Python Tutorials
 OpenCV
 ===============
 
-OpenCV was started at Intel in 1999 by **Gary Bradsky** and the first release came out in 2000. **Vadim Pisarevsky** joined Gary Bradsky to manage Intel's Russian software OpenCV team. In 2005, OpenCV was used on Stanley, the vehicle who won 2005 DARPA Grand Challenge. Later its active development continued under the support of Willow Garage, with Gary Bradsky and Vadim Pisarevsky leading the project. Right now, OpenCV supports a lot of algorithms related to Computer Vision and Machine Learning and it is expanding day-by-day.
+OpenCV was started at Intel in 1999 by **Gary Bradski**, and the first release came out in 2000. **Vadim Pisarevsky** joined Gary Bradski to manage Intel's Russian software OpenCV team. In 2005, OpenCV was used on Stanley, the vehicle that won the 2005 DARPA Grand Challenge. Later, its active development continued under the support of Willow Garage with Gary Bradski and Vadim Pisarevsky leading the project. OpenCV now supports a multitude of algorithms related to Computer Vision and Machine Learning and is expanding day by day.
 
-Currently OpenCV supports a wide variety of programming languages like C++, Python, Java etc and is available on different platforms including Windows, Linux, OS X, Android, iOS etc. Also, interfaces based on CUDA and OpenCL are also under active development for high-speed GPU operations.
+OpenCV supports a wide variety of programming languages such as C++, Python, Java, etc., and is available on different platforms including Windows, Linux, OS X, Android, and iOS. Interfaces for high-speed GPU operations based on CUDA and OpenCL are also under active development.
 
-OpenCV-Python is the Python API of OpenCV. It combines the best qualities of OpenCV C++ API and Python language.
+OpenCV-Python is the Python API for OpenCV, combining the best qualities of the OpenCV C++ API and the Python language.
 
 
 OpenCV-Python
 ===============
 
-Python is a general purpose programming language started by **Guido van Rossum**, which became very popular in short time mainly because of its simplicity and code readability. It enables the programmer to express his ideas in fewer lines of code without reducing any readability.
+OpenCV-Python is a library of Python bindings designed to solve computer vision problems.
 
-Compared to other languages like C/C++, Python is slower. But another important feature of Python is that it can be easily extended with C/C++. This feature helps us to write computationally intensive codes in C/C++ and create a Python wrapper for it so that we can use these wrappers as Python modules. This gives us two advantages: first, our code is as fast as original C/C++ code (since it is the actual C++ code working in background) and second, it is very easy to code in Python. This is how OpenCV-Python works, it is a Python wrapper around original C++ implementation.
+Python is a general purpose programming language started by **Guido van Rossum** that became very popular very quickly, mainly because of its simplicity and code readability. It enables the programmer to express ideas in fewer lines of code without reducing readability.
 
-And the support of Numpy makes the task more easier. **Numpy** is a highly optimized library for numerical operations. It gives a MATLAB-style syntax. All the OpenCV array structures are converted to-and-from Numpy arrays. So whatever operations you can do in Numpy, you can combine it with OpenCV, which increases number of weapons in your arsenal. Besides that, several other libraries like SciPy, Matplotlib which supports Numpy can be used with this.
+Compared to languages like C/C++, Python is slower. That said, Python can be easily extended with C/C++, which allows us to write computationally intensive code in C/C++ and create Python wrappers that can be used as Python modules. This gives us two advantages: first, the code is as fast as the original C/C++ code (since it is the actual C++ code working in the background) and second, it is easier to code in Python than in C/C++. OpenCV-Python is a Python wrapper for the original OpenCV C++ implementation.
 
-So OpenCV-Python is an appropriate tool for fast prototyping of computer vision problems.
+OpenCV-Python makes use of **Numpy**, which is a highly optimized library for numerical operations with a MATLAB-style syntax. All the OpenCV array structures are converted to and from Numpy arrays. This also makes it easier to integrate with other libraries that use Numpy such as SciPy and Matplotlib.
 
 
 OpenCV-Python Tutorials
 =============================
 
-OpenCV introduces a new set of tutorials which will guide you through various functions available in OpenCV-Python. **This guide is mainly focused on OpenCV 3.x version** (although most of the tutorials will work with OpenCV 2.x also).
+OpenCV introduces a new set of tutorials which will guide you through various functions available in OpenCV-Python. **This guide is mainly focused on OpenCV 3.x version** (although most of the tutorials will also work with OpenCV 2.x).
 
-A prior knowledge on Python and Numpy is required before starting because they won't be covered in this guide. **Especially, a good knowledge on Numpy is must to write optimized codes in OpenCV-Python.**
+Prior knowledge of Python and Numpy is recommended as they won't be covered in this guide. **Proficiency with Numpy is a must in order to write optimized code using OpenCV-Python.**
 
-This tutorial has been started by *Abid Rahman K.* as part of Google Summer of Code 2013 program, under the guidance of *Alexander Mordvintsev*.
+This tutorial was originally started by *Abid Rahman K.* as part of the Google Summer of Code 2013 program under the guidance of *Alexander Mordvintsev*.
 
 
 OpenCV Needs You !!!
 ==========================
 
-Since OpenCV is an open source initiative, all are welcome to make contributions to this library. And it is same for this tutorial also.
+Since OpenCV is an open source initiative, all are welcome to make contributions to the library, documentation, and tutorials. If you find any mistake in this tutorial (from a small spelling mistake to an egregious error in code or concept), feel free to correct it by cloning OpenCV in `GitHub <https://github.com/Itseez/opencv>`_ and submitting a pull request. OpenCV developers will check your pull request, give you important feedback and (once it passes the approval of the reviewer) it will be merged into OpenCV. You will then become an open source contributor :-)
 
-So, if you find any mistake in this tutorial (whether it be a small spelling mistake or a big error in code or concepts, whatever), feel free to correct it.
-
-And that will be a good task for freshers who begin to contribute to open source projects. Just fork the OpenCV in github, make necessary corrections and send a pull request to OpenCV. OpenCV developers will check your pull request, give you important feedback and once it passes the approval of the reviewer, it will be merged to OpenCV. Then you become a open source contributor. Similar is the case with other tutorials, documentation etc.
-
-As new modules are added to OpenCV-Python, this tutorial will have to be expanded. So those who knows about particular algorithm can write up a tutorial which includes a basic theory of the algorithm and a code showing basic usage of the algorithm and submit it to OpenCV.
+As new modules are added to OpenCV-Python, this tutorial will have to be expanded. If you are familiar with a particular algorithm and can write up a tutorial including basic theory of the algorithm and code showing example usage, please do so.
 
 Remember, we **together** can make this project a great success !!!
 
diff --git a/doc/py_tutorials/py_video/py_meanshift/py_meanshift.rst b/doc/py_tutorials/py_video/py_meanshift/py_meanshift.rst
index a111311af..87ece6935 100644
--- a/doc/py_tutorials/py_video/py_meanshift/py_meanshift.rst
+++ b/doc/py_tutorials/py_video/py_meanshift/py_meanshift.rst
@@ -52,7 +52,7 @@ To use meanshift in OpenCV, first we need to setup the target, find its histogra
 
     # set up the ROI for tracking
     roi = frame[r:r+h, c:c+w]
-    hsv_roi =  cv2.cvtColor(frame, cv2.COLOR_BGR2HSV)
+    hsv_roi =  cv2.cvtColor(roi, cv2.COLOR_BGR2HSV)
     mask = cv2.inRange(hsv_roi, np.array((0., 60.,32.)), np.array((180.,255.,255.)))
     roi_hist = cv2.calcHist([hsv_roi],[0],mask,[180],[0,180])
     cv2.normalize(roi_hist,roi_hist,0,255,cv2.NORM_MINMAX)
@@ -127,7 +127,7 @@ It is almost same as meanshift, but it returns a rotated rectangle (that is our
 
     # set up the ROI for tracking
     roi = frame[r:r+h, c:c+w]
-    hsv_roi =  cv2.cvtColor(frame, cv2.COLOR_BGR2HSV)
+    hsv_roi =  cv2.cvtColor(roi, cv2.COLOR_BGR2HSV)
     mask = cv2.inRange(hsv_roi, np.array((0., 60.,32.)), np.array((180.,255.,255.)))
     roi_hist = cv2.calcHist([hsv_roi],[0],mask,[180],[0,180])
     cv2.normalize(roi_hist,roi_hist,0,255,cv2.NORM_MINMAX)
diff --git a/doc/tutorials/core/adding_images/adding_images.rst b/doc/tutorials/core/adding_images/adding_images.rst
index 3559132dc..a6d201e43 100644
--- a/doc/tutorials/core/adding_images/adding_images.rst
+++ b/doc/tutorials/core/adding_images/adding_images.rst
@@ -6,12 +6,12 @@ Adding (blending) two images using OpenCV
 Goal
 =====
 
-In this tutorial you will learn how to:
+In this tutorial you will learn:
 
 .. container:: enumeratevisibleitemswithsquare
 
-   * What is *linear blending* and why it is useful.
-   * Add two images using :add_weighted:`addWeighted <>`
+   * what is *linear blending* and why it is useful;
+   * how to add two images using :add_weighted:`addWeighted <>`
 
 Theory
 =======
diff --git a/doc/tutorials/core/how_to_scan_images/how_to_scan_images.rst b/doc/tutorials/core/how_to_scan_images/how_to_scan_images.rst
index ef0f8640c..b6a18fee8 100644
--- a/doc/tutorials/core/how_to_scan_images/how_to_scan_images.rst
+++ b/doc/tutorials/core/how_to_scan_images/how_to_scan_images.rst
@@ -18,7 +18,7 @@ We'll seek answers for the following questions:
 Our test case
 =============
 
-Let us consider a simple color reduction method. Using the unsigned char C and C++ type for matrix item storing a channel of pixel may have up to 256 different values. For a three channel image this can allow the formation of way too many colors (16 million to be exact). Working with so many color shades may give a heavy blow to our algorithm performance. However, sometimes it is enough to work with a lot less of them to get the same final result.
+Let us consider a simple color reduction method. When the unsigned char C and C++ type is used to store matrix items, a channel of a pixel may have up to 256 different values. For a three channel image this can allow the formation of way too many colors (16 million to be exact). Working with so many color shades may give a heavy blow to our algorithm performance. However, sometimes it is enough to work with a lot fewer of them to get the same final result.
 
 In this cases it's common that we make a *color space reduction*. This means that we divide the color space current value with a new input value to end up with fewer colors. For instance every value between zero and nine takes the new value zero, every value between ten and nineteen the value ten and so on.
 
diff --git a/doc/tutorials/core/mat-mask-operations/mat-mask-operations.rst b/doc/tutorials/core/mat-mask-operations/mat-mask-operations.rst
index 0549a9c12..f28920e77 100644
--- a/doc/tutorials/core/mat-mask-operations/mat-mask-operations.rst
+++ b/doc/tutorials/core/mat-mask-operations/mat-mask-operations.rst
@@ -32,14 +32,14 @@ Here's a function that will do this:
 
 .. code-block:: cpp
 
-   void Sharpen(const Mat& myImage,Mat& Result)
+   void Sharpen(const Mat& myImage, Mat& Result)
    {
        CV_Assert(myImage.depth() == CV_8U);  // accept only uchar images
 
-       Result.create(myImage.size(),myImage.type());
+       Result.create(myImage.size(), myImage.type());
        const int nChannels = myImage.channels();
 
-       for(int j = 1 ; j < myImage.rows-1; ++j)
+       for(int j = 1; j < myImage.rows - 1; ++j)
        {
            const uchar* previous = myImage.ptr<uchar>(j - 1);
            const uchar* current  = myImage.ptr<uchar>(j    );
@@ -47,17 +47,17 @@ Here's a function that will do this:
 
            uchar* output = Result.ptr<uchar>(j);
 
-           for(int i= nChannels;i < nChannels*(myImage.cols-1); ++i)
+           for(int i = nChannels; i < nChannels * (myImage.cols - 1); ++i)
            {
-               *output++ = saturate_cast<uchar>(5*current[i]
-                            -current[i-nChannels] - current[i+nChannels] - previous[i] - next[i]);
+               *output++ = saturate_cast<uchar>(5 * current[i]
+                            -current[i - nChannels] - current[i + nChannels] - previous[i] - next[i]);
            }
        }
 
        Result.row(0).setTo(Scalar(0));
-       Result.row(Result.rows-1).setTo(Scalar(0));
+       Result.row(Result.rows - 1).setTo(Scalar(0));
        Result.col(0).setTo(Scalar(0));
-       Result.col(Result.cols-1).setTo(Scalar(0));
+       Result.col(Result.cols - 1).setTo(Scalar(0));
    }
 
 At first we make sure that the input images data is in unsigned char format. For this we use the :utilitysystemfunctions:`CV_Assert <cv-assert>` function that throws an error when the expression inside it is false.
@@ -70,14 +70,14 @@ We create an output image with the same size and the same type as our input. As
 
 .. code-block:: cpp
 
-   Result.create(myImage.size(),myImage.type());
+   Result.create(myImage.size(), myImage.type());
    const int nChannels = myImage.channels();
 
 We'll use the plain C [] operator to access pixels. Because we need to access multiple rows at the same time we'll acquire the pointers for each of them (a previous, a current and a next line). We need another pointer to where we're going to save the calculation. Then simply access the right items with the [] operator. For moving the output pointer ahead we simply increase this (with one byte) after each operation:
 
 .. code-block:: cpp
 
-   for(int j = 1 ; j < myImage.rows-1; ++j)
+   for(int j = 1; j < myImage.rows - 1; ++j)
    {
        const uchar* previous = myImage.ptr<uchar>(j - 1);
        const uchar* current  = myImage.ptr<uchar>(j    );
@@ -85,21 +85,21 @@ We'll use the plain C [] operator to access pixels. Because we need to access mu
 
        uchar* output = Result.ptr<uchar>(j);
 
-       for(int i= nChannels;i < nChannels*(myImage.cols-1); ++i)
+       for(int i = nChannels; i < nChannels * (myImage.cols - 1); ++i)
        {
-           *output++ = saturate_cast<uchar>(5*current[i]
-                        -current[i-nChannels] - current[i+nChannels] - previous[i] - next[i]);
+           *output++ = saturate_cast<uchar>(5 * current[i]
+                        -current[i - nChannels] - current[i + nChannels] - previous[i] - next[i]);
        }
    }
 
-On the borders of the image the upper notation results inexistent pixel locations (like minus one - minus one). In these points our formula is undefined. A simple solution is to not apply the mask in these points and, for example, set the pixels on the borders to zeros:
+On the borders of the image the notation above results in nonexistent pixel locations (like minus one - minus one). In these points our formula is undefined. A simple solution is to not apply the kernel in these points and, for example, set the pixels on the borders to zeros:
 
 .. code-block:: cpp
 
-   Result.row(0).setTo(Scalar(0));             // The top row
-   Result.row(Result.rows-1).setTo(Scalar(0)); // The bottom row
-   Result.col(0).setTo(Scalar(0));             // The left column
-   Result.col(Result.cols-1).setTo(Scalar(0)); // The right column
+   Result.row(0).setTo(Scalar(0));               // The top row
+   Result.row(Result.rows - 1).setTo(Scalar(0)); // The bottom row
+   Result.col(0).setTo(Scalar(0));               // The left column
+   Result.col(Result.cols - 1).setTo(Scalar(0)); // The right column
 
 The filter2D function
 =====================
@@ -116,7 +116,7 @@ Then call the :filtering:`filter2D <filter2d>` function specifying the input, th
 
 .. code-block:: cpp
 
-   filter2D(I, K, I.depth(), kern );
+   filter2D(I, K, I.depth(), kern);
 
 The function even has a fifth optional argument to specify the center of the kernel, and a sixth one for determining what to do in the regions where the operation is undefined (borders). Using this function has the advantage that it's shorter, less verbose and because there are some optimization techniques implemented it is usually faster than the *hand-coded method*. For example in my test while the second one took only 13 milliseconds the first took around 31 milliseconds. Quite some difference.
 
diff --git a/doc/tutorials/core/mat_the_basic_image_container/mat_the_basic_image_container.rst b/doc/tutorials/core/mat_the_basic_image_container/mat_the_basic_image_container.rst
index 171d2e683..de38a858d 100644
--- a/doc/tutorials/core/mat_the_basic_image_container/mat_the_basic_image_container.rst
+++ b/doc/tutorials/core/mat_the_basic_image_container/mat_the_basic_image_container.rst
@@ -45,7 +45,7 @@ All the above objects, in the end, point to the same single data matrix. Their h
    :linenos:
 
    Mat D (A, Rect(10, 10, 100, 100) ); // using a rectangle
-   Mat E = A(Range:all(), Range(1,3)); // using row and column boundaries
+   Mat E = A(Range::all(), Range(1,3)); // using row and column boundaries
 
 Now you may ask if the matrix itself may belong to multiple *Mat* objects who takes responsibility for cleaning it up when it's no longer needed. The short answer is: the last object that used it. This is handled by using a reference counting mechanism. Whenever somebody copies a header of a *Mat* object, a counter is increased for the matrix. Whenever a header is cleaned this counter is decreased. When the counter reaches zero the matrix too is freed. Sometimes you will want to copy the matrix itself too, so OpenCV provides the :basicstructures:`clone() <mat-clone>` and :basicstructures:`copyTo() <mat-copyto>` functions.
 
@@ -86,7 +86,7 @@ Each of the building components has their own valid domains. This leads to the d
 Creating a *Mat* object explicitly
 ==================================
 
-In the :ref:`Load_Save_Image` tutorial you have already learned how to write a matrix to an image file by using the :readWriteImageVideo:` imwrite() <imwrite>` function. However, for debugging purposes it's much more convenient to see the actual values. You can do this using the << operator of *Mat*. Be aware that this only works for two dimensional matrices.
+In the :ref:`Load_Save_Image` tutorial you have already learned how to write a matrix to an image file by using the :readwriteimagevideo:`imwrite() <imwrite>` function. However, for debugging purposes it's much more convenient to see the actual values. You can do this using the << operator of *Mat*. Be aware that this only works for two dimensional matrices.
 
 Although *Mat* works really well as an image container, it is also a general matrix class. Therefore, it is possible to create and manipulate multidimensional matrices. You can create a Mat object in multiple ways:
 
diff --git a/doc/tutorials/imgproc/histograms/histogram_comparison/histogram_comparison.rst b/doc/tutorials/imgproc/histograms/histogram_comparison/histogram_comparison.rst
index 6a8345d69..f5f636d08 100644
--- a/doc/tutorials/imgproc/histograms/histogram_comparison/histogram_comparison.rst
+++ b/doc/tutorials/imgproc/histograms/histogram_comparison/histogram_comparison.rst
@@ -84,88 +84,10 @@ Code
 
    * **Code at glance:**
 
-.. code-block:: cpp
+.. literalinclude:: ../../../../../samples/cpp/tutorial_code/Histograms_Matching/compareHist_Demo.cpp
+   :language: cpp
+   :tab-width: 4
 
-   #include "opencv2/highgui.hpp"
-   #include "opencv2/imgproc.hpp"
-   #include <iostream>
-   #include <stdio.h>
-
-   using namespace std;
-   using namespace cv;
-
-   /** @function main */
-   int main( int argc, char** argv )
-   {
-     Mat src_base, hsv_base;
-     Mat src_test1, hsv_test1;
-     Mat src_test2, hsv_test2;
-     Mat hsv_half_down;
-
-     /// Load three images with different environment settings
-     if( argc < 4 )
-       { printf("** Error. Usage: ./compareHist_Demo <image_settings0> <image_setting1> <image_settings2>\n");
-         return -1;
-       }
-
-     src_base = imread( argv[1], 1 );
-     src_test1 = imread( argv[2], 1 );
-     src_test2 = imread( argv[3], 1 );
-
-     /// Convert to HSV
-     cvtColor( src_base, hsv_base, CV_BGR2HSV );
-     cvtColor( src_test1, hsv_test1, CV_BGR2HSV );
-     cvtColor( src_test2, hsv_test2, CV_BGR2HSV );
-
-     hsv_half_down = hsv_base( Range( hsv_base.rows/2, hsv_base.rows - 1 ), Range( 0, hsv_base.cols - 1 ) );
-
-     /// Using 30 bins for hue and 32 for saturation
-     int h_bins = 50; int s_bins = 60;
-     int histSize[] = { h_bins, s_bins };
-
-     // hue varies from 0 to 256, saturation from 0 to 180
-     float h_ranges[] = { 0, 256 };
-     float s_ranges[] = { 0, 180 };
-
-     const float* ranges[] = { h_ranges, s_ranges };
-
-     // Use the o-th and 1-st channels
-     int channels[] = { 0, 1 };
-
-     /// Histograms
-     MatND hist_base;
-     MatND hist_half_down;
-     MatND hist_test1;
-     MatND hist_test2;
-
-     /// Calculate the histograms for the HSV images
-     calcHist( &hsv_base, 1, channels, Mat(), hist_base, 2, histSize, ranges, true, false );
-     normalize( hist_base, hist_base, 0, 1, NORM_MINMAX, -1, Mat() );
-
-     calcHist( &hsv_half_down, 1, channels, Mat(), hist_half_down, 2, histSize, ranges, true, false );
-     normalize( hist_half_down, hist_half_down, 0, 1, NORM_MINMAX, -1, Mat() );
-
-     calcHist( &hsv_test1, 1, channels, Mat(), hist_test1, 2, histSize, ranges, true, false );
-     normalize( hist_test1, hist_test1, 0, 1, NORM_MINMAX, -1, Mat() );
-
-     calcHist( &hsv_test2, 1, channels, Mat(), hist_test2, 2, histSize, ranges, true, false );
-     normalize( hist_test2, hist_test2, 0, 1, NORM_MINMAX, -1, Mat() );
-
-     /// Apply the histogram comparison methods
-     for( int i = 0; i < 4; i++ )
-        { int compare_method = i;
-          double base_base = compareHist( hist_base, hist_base, compare_method );
-          double base_half = compareHist( hist_base, hist_half_down, compare_method );
-          double base_test1 = compareHist( hist_base, hist_test1, compare_method );
-          double base_test2 = compareHist( hist_base, hist_test2, compare_method );
-
-          printf( " Method [%d] Perfect, Base-Half, Base-Test(1), Base-Test(2) : %f, %f, %f, %f \n", i, base_base, base_half , base_test1, base_test2 );
-        }
-
-     printf( "Done \n" );
-
-     return 0;
-    }
 
 
 Explanation
@@ -211,11 +133,11 @@ Explanation
 
    .. code-block:: cpp
 
-     int h_bins = 50; int s_bins = 32;
+     int h_bins = 50; int s_bins = 60;
      int histSize[] = { h_bins, s_bins };
 
-     float h_ranges[] = { 0, 256 };
-     float s_ranges[] = { 0, 180 };
+     float h_ranges[] = { 0, 180 };
+     float s_ranges[] = { 0, 256 };
 
      const float* ranges[] = { h_ranges, s_ranges };
 
diff --git a/doc/tutorials/introduction/load_save_image/load_save_image.rst b/doc/tutorials/introduction/load_save_image/load_save_image.rst
index 57d55d3a9..dec75c195 100644
--- a/doc/tutorials/introduction/load_save_image/load_save_image.rst
+++ b/doc/tutorials/introduction/load_save_image/load_save_image.rst
@@ -5,7 +5,7 @@ Load, Modify, and Save an Image
 
 .. note::
 
-   We assume that by now you know how to load an image using :imread:`imread <>` and to display it in a window (using :imshow:`imshow <>`). Read the :ref:`Display_Image` tutorial otherwise.
+   We assume that by now you know how to load an image using :readwriteimagevideo:`imread <imread>` and to display it in a window (using :user_interface:`imshow <imshow>`). Read the :ref:`Display_Image` tutorial otherwise.
 
 Goals
 ======
@@ -14,9 +14,9 @@ In this tutorial you will learn how to:
 
 .. container:: enumeratevisibleitemswithsquare
 
-   * Load an image using :imread:`imread <>`
-   * Transform an image from BGR to Grayscale format by using :cvt_color:`cvtColor <>`
-   * Save your transformed image in a file on disk (using :imwrite:`imwrite <>`)
+   * Load an image using :readwriteimagevideo:`imread <imread>`
+   * Transform an image from BGR to Grayscale format by using :miscellaneous_transformations:`cvtColor <cvtcolor>`
+   * Save your transformed image in a file on disk (using :readwriteimagevideo:`imwrite <imwrite>`)
 
 Code
 ======
@@ -62,10 +62,7 @@ Here it is:
 Explanation
 ============
 
-#. We begin by:
-
-   * Creating a Mat object to store the image information
-   * Load an image using :imread:`imread <>`, located in the path given by *imageName*. Fort this example, assume you are loading a RGB image.
+#. We begin by loading an image using :readwriteimagevideo:`imread <imread>`, located in the path given by *imageName*. For this example, assume you are loading an RGB image.
 
 #. Now we are going to convert our image from BGR to Grayscale format. OpenCV has a really nice function to do this kind of transformations:
 
@@ -73,15 +70,15 @@ Explanation
 
       cvtColor( image, gray_image, CV_BGR2GRAY );
 
-   As you can see, :cvt_color:`cvtColor <>` takes as arguments:
+   As you can see, :miscellaneous_transformations:`cvtColor <cvtcolor>` takes as arguments:
 
    .. container:: enumeratevisibleitemswithsquare
 
       * a source image (*image*)
       * a destination image (*gray_image*), in which we will save the converted image.
-      * an additional parameter that indicates what kind of transformation will be performed. In this case we use **CV_BGR2GRAY** (because of :imread:`imread <>` has BGR default channel order in case of color images).
+      * an additional parameter that indicates what kind of transformation will be performed. In this case we use **CV_BGR2GRAY** (because :readwriteimagevideo:`imread <imread>` uses BGR channel order by default for color images).
 
-#. So now we have our new *gray_image* and want to save it on disk (otherwise it will get lost after the program ends). To save it, we will use a function analagous to :imread:`imread <>`: :imwrite:`imwrite <>`
+#. So now we have our new *gray_image* and want to save it on disk (otherwise it will get lost after the program ends). To save it, we will use a function analogous to :readwriteimagevideo:`imread <imread>`: :readwriteimagevideo:`imwrite <imwrite>`
 
    .. code-block:: cpp
 
diff --git a/modules/androidcamera/camera_wrapper/CMakeLists.txt b/modules/androidcamera/camera_wrapper/CMakeLists.txt
index bc5585a7a..d08e2c469 100644
--- a/modules/androidcamera/camera_wrapper/CMakeLists.txt
+++ b/modules/androidcamera/camera_wrapper/CMakeLists.txt
@@ -58,7 +58,7 @@ SET_TARGET_PROPERTIES(${the_target} PROPERTIES
                       RUNTIME_OUTPUT_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}
                       )
 
-if (NOT (CMAKE_BUILD_TYPE MATCHES "debug"))
+if (NOT (CMAKE_BUILD_TYPE MATCHES "Debug"))
     ADD_CUSTOM_COMMAND( TARGET ${the_target} POST_BUILD COMMAND ${CMAKE_STRIP} --strip-unneeded "${LIBRARY_OUTPUT_PATH}/lib${the_target}.so" )
 endif()
 
diff --git a/modules/androidcamera/camera_wrapper/camera_wrapper.cpp b/modules/androidcamera/camera_wrapper/camera_wrapper.cpp
index 5ca1778a5..0ed301323 100644
--- a/modules/androidcamera/camera_wrapper/camera_wrapper.cpp
+++ b/modules/androidcamera/camera_wrapper/camera_wrapper.cpp
@@ -61,6 +61,12 @@
 
 using namespace android;
 
+// Non-public camera-related classes are not binary compatible:
+// objects of these classes have a different sizeof on different platforms.
+// MAGIC_TAIL extra bytes are allocated after each such system object to overcome the sizeof issue.
+#define MAGIC_TAIL 4096
+
+
 void debugShowFPS();
 
 #if defined(ANDROID_r4_1_1) || defined(ANDROID_r4_2_0) || defined(ANDROID_r4_3_0)
@@ -90,6 +96,7 @@ public:
 };
 #endif
 
+
 std::string getProcessName()
 {
     std::string result;
@@ -142,12 +149,22 @@ class CameraHandler: public CameraListener
 protected:
     int cameraId;
     sp<Camera> camera;
-    CameraParameters params;
+#if defined(ANDROID_r3_0_1) || defined(ANDROID_r4_0_0) || defined(ANDROID_r4_0_3)
+    sp<SurfaceTexture> surface;
+#endif
+#if defined(ANDROID_r4_1_1) || defined(ANDROID_r4_2_0) || defined(ANDROID_r4_3_0) || defined(ANDROID_r4_4_0)
+    sp<BufferQueue> queue;
+    sp<ConsumerListenerStub> listener;
+#endif
+    CameraParameters* params;
     CameraCallback cameraCallback;
     void* userData;
 
     int emptyCameraCallbackReported;
 
+    int width;
+    int height;
+
     static const char* flashModesNames[ANDROID_CAMERA_FLASH_MODES_NUM];
     static const char* focusModesNames[ANDROID_CAMERA_FOCUS_MODES_NUM];
     static const char* whiteBalanceModesNames[ANDROID_CAMERA_WHITE_BALANCE_MODES_NUM];
@@ -258,7 +275,7 @@ protected:
 
     int is_supported(const char* supp_modes_key, const char* mode)
     {
-        const char* supported_modes = params.get(supp_modes_key);
+        const char* supported_modes = params->get(supp_modes_key);
         return (supported_modes && mode && (strstr(supported_modes, mode) > 0));
     }
 
@@ -268,7 +285,7 @@ protected:
         if (focus_distance_type >= 0 && focus_distance_type < 3)
     {
             float focus_distances[3];
-            const char* output = params.get(CameraParameters::KEY_FOCUS_DISTANCES);
+            const char* output = params->get(CameraParameters::KEY_FOCUS_DISTANCES);
             int val_num = CameraHandler::split_float(output, focus_distances, ',', 3);
             if(val_num == 3)
         {
@@ -300,10 +317,15 @@ public:
         emptyCameraCallbackReported(0)
     {
         LOGD("Instantiated new CameraHandler (%p, %p)", callback, _userData);
+        void* params_buffer = operator new(sizeof(CameraParameters) + MAGIC_TAIL);
+        params = new(params_buffer) CameraParameters();
     }
 
     virtual ~CameraHandler()
     {
+        if (params)
+            params->~CameraParameters();
+            operator delete(params);
         LOGD("CameraHandler destructor is called");
     }
 
@@ -371,10 +393,18 @@ const char* CameraHandler::focusModesNames[ANDROID_CAMERA_FOCUS_MODES_NUM] =
     CameraParameters::FOCUS_MODE_AUTO,
 #if !defined(ANDROID_r2_2_0)
     CameraParameters::FOCUS_MODE_CONTINUOUS_VIDEO,
+#else
+    CameraParameters::FOCUS_MODE_AUTO,
 #endif
     CameraParameters::FOCUS_MODE_EDOF,
     CameraParameters::FOCUS_MODE_FIXED,
-    CameraParameters::FOCUS_MODE_INFINITY
+    CameraParameters::FOCUS_MODE_INFINITY,
+    CameraParameters::FOCUS_MODE_MACRO,
+#if !defined(ANDROID_r2_2_0) && !defined(ANDROID_r2_3_3) && !defined(ANDROID_r3_0_1)
+    CameraParameters::FOCUS_MODE_CONTINUOUS_PICTURE
+#else
+    CameraParameters::FOCUS_MODE_AUTO
+#endif
 };
 
 const char* CameraHandler::whiteBalanceModesNames[ANDROID_CAMERA_WHITE_BALANCE_MODES_NUM] =
@@ -534,39 +564,39 @@ CameraHandler* CameraHandler::initCameraConnect(const CameraCallback& callback,
     {
         LOGI("initCameraConnect: Setting paramers from previous camera handler");
         camera->setParameters(prevCameraParameters->flatten());
-        handler->params.unflatten(prevCameraParameters->flatten());
+        handler->params->unflatten(prevCameraParameters->flatten());
     }
     else
     {
         android::String8 params_str = camera->getParameters();
         LOGI("initCameraConnect: [%s]", params_str.string());
 
-        handler->params.unflatten(params_str);
+        handler->params->unflatten(params_str);
 
-        LOGD("Supported Cameras: %s", handler->params.get("camera-indexes"));
-        LOGD("Supported Picture Sizes: %s", handler->params.get(CameraParameters::KEY_SUPPORTED_PICTURE_SIZES));
-        LOGD("Supported Picture Formats: %s", handler->params.get(CameraParameters::KEY_SUPPORTED_PICTURE_FORMATS));
-        LOGD("Supported Preview Sizes: %s", handler->params.get(CameraParameters::KEY_SUPPORTED_PREVIEW_SIZES));
-        LOGD("Supported Preview Formats: %s", handler->params.get(CameraParameters::KEY_SUPPORTED_PREVIEW_FORMATS));
-        LOGD("Supported Preview Frame Rates: %s", handler->params.get(CameraParameters::KEY_SUPPORTED_PREVIEW_FRAME_RATES));
-        LOGD("Supported Thumbnail Sizes: %s", handler->params.get(CameraParameters::KEY_SUPPORTED_JPEG_THUMBNAIL_SIZES));
-        LOGD("Supported Whitebalance Modes: %s", handler->params.get(CameraParameters::KEY_SUPPORTED_WHITE_BALANCE));
-        LOGD("Supported Effects: %s", handler->params.get(CameraParameters::KEY_SUPPORTED_EFFECTS));
-        LOGD("Supported Scene Modes: %s", handler->params.get(CameraParameters::KEY_SUPPORTED_SCENE_MODES));
-        LOGD("Supported Focus Modes: %s", handler->params.get(CameraParameters::KEY_SUPPORTED_FOCUS_MODES));
-        LOGD("Supported Antibanding Options: %s", handler->params.get(CameraParameters::KEY_SUPPORTED_ANTIBANDING));
-        LOGD("Supported Flash Modes: %s", handler->params.get(CameraParameters::KEY_SUPPORTED_FLASH_MODES));
+        LOGD("Supported Cameras: %s", handler->params->get("camera-indexes"));
+        LOGD("Supported Picture Sizes: %s", handler->params->get(CameraParameters::KEY_SUPPORTED_PICTURE_SIZES));
+        LOGD("Supported Picture Formats: %s", handler->params->get(CameraParameters::KEY_SUPPORTED_PICTURE_FORMATS));
+        LOGD("Supported Preview Sizes: %s", handler->params->get(CameraParameters::KEY_SUPPORTED_PREVIEW_SIZES));
+        LOGD("Supported Preview Formats: %s", handler->params->get(CameraParameters::KEY_SUPPORTED_PREVIEW_FORMATS));
+        LOGD("Supported Preview Frame Rates: %s", handler->params->get(CameraParameters::KEY_SUPPORTED_PREVIEW_FRAME_RATES));
+        LOGD("Supported Thumbnail Sizes: %s", handler->params->get(CameraParameters::KEY_SUPPORTED_JPEG_THUMBNAIL_SIZES));
+        LOGD("Supported Whitebalance Modes: %s", handler->params->get(CameraParameters::KEY_SUPPORTED_WHITE_BALANCE));
+        LOGD("Supported Effects: %s", handler->params->get(CameraParameters::KEY_SUPPORTED_EFFECTS));
+        LOGD("Supported Scene Modes: %s", handler->params->get(CameraParameters::KEY_SUPPORTED_SCENE_MODES));
+        LOGD("Supported Focus Modes: %s", handler->params->get(CameraParameters::KEY_SUPPORTED_FOCUS_MODES));
+        LOGD("Supported Antibanding Options: %s", handler->params->get(CameraParameters::KEY_SUPPORTED_ANTIBANDING));
+        LOGD("Supported Flash Modes: %s", handler->params->get(CameraParameters::KEY_SUPPORTED_FLASH_MODES));
 
 #if !defined(ANDROID_r2_2_0)
         // Set focus mode to continuous-video if supported
-        const char* available_focus_modes = handler->params.get(CameraParameters::KEY_SUPPORTED_FOCUS_MODES);
+        const char* available_focus_modes = handler->params->get(CameraParameters::KEY_SUPPORTED_FOCUS_MODES);
         if (available_focus_modes != 0)
         {
             if (strstr(available_focus_modes, "continuous-video") != NULL)
             {
-                handler->params.set(CameraParameters::KEY_FOCUS_MODE, CameraParameters::FOCUS_MODE_CONTINUOUS_VIDEO);
+                handler->params->set(CameraParameters::KEY_FOCUS_MODE, CameraParameters::FOCUS_MODE_CONTINUOUS_VIDEO);
 
-                status_t resParams = handler->camera->setParameters(handler->params.flatten());
+                status_t resParams = handler->camera->setParameters(handler->params->flatten());
 
                 if (resParams != 0)
                 {
@@ -581,7 +611,7 @@ CameraHandler* CameraHandler::initCameraConnect(const CameraCallback& callback,
 #endif
 
         //check if yuv420sp format available. Set this format as preview format.
-        const char* available_formats = handler->params.get(CameraParameters::KEY_SUPPORTED_PREVIEW_FORMATS);
+        const char* available_formats = handler->params->get(CameraParameters::KEY_SUPPORTED_PREVIEW_FORMATS);
         if (available_formats != 0)
         {
             const char* format_to_set = 0;
@@ -607,9 +637,9 @@ CameraHandler* CameraHandler::initCameraConnect(const CameraCallback& callback,
 
             if (0 != format_to_set)
             {
-                handler->params.setPreviewFormat(format_to_set);
+                handler->params->setPreviewFormat(format_to_set);
 
-                status_t resParams = handler->camera->setParameters(handler->params.flatten());
+                status_t resParams = handler->camera->setParameters(handler->params->flatten());
 
                 if (resParams != 0)
                     LOGE("initCameraConnect: failed to set preview format to %s", format_to_set);
@@ -617,6 +647,13 @@ CameraHandler* CameraHandler::initCameraConnect(const CameraCallback& callback,
                     LOGD("initCameraConnect: preview format is set to %s", format_to_set);
             }
         }
+
+        handler->params->setPreviewSize(640, 480);
+        status_t resParams = handler->camera->setParameters(handler->params->flatten());
+        if (resParams != 0)
+            LOGE("initCameraConnect: failed to set preview resolution to 640x480");
+        else
+            LOGD("initCameraConnect: preview format is set to 640x480");
     }
 
     status_t bufferStatus;
@@ -627,22 +664,27 @@ CameraHandler* CameraHandler::initCameraConnect(const CameraCallback& callback,
 #elif defined(ANDROID_r2_3_3)
     /* Do nothing in case of 2.3 for now */
 #elif defined(ANDROID_r3_0_1) || defined(ANDROID_r4_0_0) || defined(ANDROID_r4_0_3)
-    sp<SurfaceTexture> surfaceTexture = new SurfaceTexture(MAGIC_OPENCV_TEXTURE_ID);
-    bufferStatus = camera->setPreviewTexture(surfaceTexture);
+    void* surface_texture_obj = operator new(sizeof(SurfaceTexture) + MAGIC_TAIL);
+    handler->surface = new(surface_texture_obj) SurfaceTexture(MAGIC_OPENCV_TEXTURE_ID);
+    bufferStatus = camera->setPreviewTexture(handler->surface);
     if (bufferStatus != 0)
         LOGE("initCameraConnect: failed setPreviewTexture call (status %d); camera might not work correctly", bufferStatus);
 #elif defined(ANDROID_r4_1_1) || defined(ANDROID_r4_2_0) || defined(ANDROID_r4_3_0)
-    sp<BufferQueue> bufferQueue = new BufferQueue();
-    sp<BufferQueue::ConsumerListener> queueListener = new ConsumerListenerStub();
-    bufferQueue->consumerConnect(queueListener);
-    bufferStatus = camera->setPreviewTexture(bufferQueue);
+    void* buffer_queue_obj = operator new(sizeof(BufferQueue) + MAGIC_TAIL);
+    handler->queue = new(buffer_queue_obj) BufferQueue();
+    void* consumer_listener_obj = operator new(sizeof(ConsumerListenerStub) + MAGIC_TAIL);
+    handler->listener = new(consumer_listener_obj) ConsumerListenerStub();
+    handler->queue->consumerConnect(handler->listener);
+    bufferStatus = camera->setPreviewTexture(handler->queue);
     if (bufferStatus != 0)
         LOGE("initCameraConnect: failed setPreviewTexture call; camera might not work correctly");
 # elif defined(ANDROID_r4_4_0)
-    sp<BufferQueue> bufferQueue = new BufferQueue();
-    sp<IConsumerListener> queueListener = new ConsumerListenerStub();
-    bufferQueue->consumerConnect(queueListener, true);
-    bufferStatus = handler->camera->setPreviewTarget(bufferQueue);
+    void* buffer_queue_obj = operator new(sizeof(BufferQueue) + MAGIC_TAIL);
+    handler->queue = new(buffer_queue_obj) BufferQueue();
+    void* consumer_listener_obj = operator new(sizeof(ConsumerListenerStub) + MAGIC_TAIL);
+    handler->listener = new(consumer_listener_obj) ConsumerListenerStub();
+    handler->queue->consumerConnect(handler->listener, true);
+    bufferStatus = handler->camera->setPreviewTarget(handler->queue);
     if (bufferStatus != 0)
         LOGE("applyProperties: failed setPreviewTexture call; camera might not work correctly");
 # endif
@@ -723,18 +765,18 @@ double CameraHandler::getProperty(int propIdx)
     case ANDROID_CAMERA_PROPERTY_FRAMEWIDTH:
     {
         int w,h;
-        params.getPreviewSize(&w, &h);
+        params->getPreviewSize(&w, &h);
         return w;
     }
     case ANDROID_CAMERA_PROPERTY_FRAMEHEIGHT:
     {
         int w,h;
-        params.getPreviewSize(&w, &h);
+        params->getPreviewSize(&w, &h);
         return h;
     }
     case ANDROID_CAMERA_PROPERTY_SUPPORTED_PREVIEW_SIZES_STRING:
     {
-        cameraPropertySupportedPreviewSizesString = params.get(CameraParameters::KEY_SUPPORTED_PREVIEW_SIZES);
+        cameraPropertySupportedPreviewSizesString = params->get(CameraParameters::KEY_SUPPORTED_PREVIEW_SIZES);
         union {const char* str;double res;} u;
         memset(&u.res, 0, sizeof(u.res));
         u.str = cameraPropertySupportedPreviewSizesString.c_str();
@@ -742,7 +784,7 @@ double CameraHandler::getProperty(int propIdx)
     }
     case ANDROID_CAMERA_PROPERTY_PREVIEW_FORMAT_STRING:
     {
-        const char* fmt = params.get(CameraParameters::KEY_PREVIEW_FORMAT);
+        const char* fmt = params->get(CameraParameters::KEY_PREVIEW_FORMAT);
         if (fmt == CameraParameters::PIXEL_FORMAT_YUV422SP)
             fmt = "yuv422sp";
         else if (fmt == CameraParameters::PIXEL_FORMAT_YUV420SP)
@@ -762,44 +804,44 @@ double CameraHandler::getProperty(int propIdx)
     }
     case ANDROID_CAMERA_PROPERTY_EXPOSURE:
     {
-        int exposure = params.getInt(CameraParameters::KEY_EXPOSURE_COMPENSATION);
+        int exposure = params->getInt(CameraParameters::KEY_EXPOSURE_COMPENSATION);
         return exposure;
     }
     case ANDROID_CAMERA_PROPERTY_FPS:
     {
-        return params.getPreviewFrameRate();
+        return params->getPreviewFrameRate();
     }
     case ANDROID_CAMERA_PROPERTY_FLASH_MODE:
     {
         int flash_mode = getModeNum(CameraHandler::flashModesNames,
                                     ANDROID_CAMERA_FLASH_MODES_NUM,
-                                    params.get(CameraParameters::KEY_FLASH_MODE));
+                                    params->get(CameraParameters::KEY_FLASH_MODE));
         return flash_mode;
     }
     case ANDROID_CAMERA_PROPERTY_FOCUS_MODE:
     {
         int focus_mode = getModeNum(CameraHandler::focusModesNames,
                                     ANDROID_CAMERA_FOCUS_MODES_NUM,
-                                    params.get(CameraParameters::KEY_FOCUS_MODE));
+                                    params->get(CameraParameters::KEY_FOCUS_MODE));
         return focus_mode;
     }
     case ANDROID_CAMERA_PROPERTY_WHITE_BALANCE:
     {
         int white_balance = getModeNum(CameraHandler::whiteBalanceModesNames,
                                        ANDROID_CAMERA_WHITE_BALANCE_MODES_NUM,
-                                       params.get(CameraParameters::KEY_WHITE_BALANCE));
+                                       params->get(CameraParameters::KEY_WHITE_BALANCE));
         return white_balance;
     }
     case ANDROID_CAMERA_PROPERTY_ANTIBANDING:
     {
         int antibanding = getModeNum(CameraHandler::antibandingModesNames,
                                      ANDROID_CAMERA_ANTIBANDING_MODES_NUM,
-                                     params.get(CameraParameters::KEY_ANTIBANDING));
+                                     params->get(CameraParameters::KEY_ANTIBANDING));
         return antibanding;
     }
     case ANDROID_CAMERA_PROPERTY_FOCAL_LENGTH:
     {
-        float focal_length = params.getFloat(CameraParameters::KEY_FOCAL_LENGTH);
+        float focal_length = params->getFloat(CameraParameters::KEY_FOCAL_LENGTH);
         return focal_length;
     }
     case ANDROID_CAMERA_PROPERTY_FOCUS_DISTANCE_NEAR:
@@ -814,6 +856,24 @@ double CameraHandler::getProperty(int propIdx)
     {
         return getFocusDistance(ANDROID_CAMERA_FOCUS_DISTANCE_FAR_INDEX);
     }
+#if !defined(ANDROID_r2_2_0) && !defined(ANDROID_r2_3_3) && !defined(ANDROID_r3_0_1)
+    case ANDROID_CAMERA_PROPERTY_WHITEBALANCE_LOCK:
+    {
+        const char* status = params->get(CameraParameters::KEY_AUTO_WHITEBALANCE_LOCK);
+        if (status == CameraParameters::TRUE)
+            return 1.;
+        else
+            return 0.;
+    }
+    case ANDROID_CAMERA_PROPERTY_EXPOSE_LOCK:
+    {
+        const char* status = params->get(CameraParameters::KEY_AUTO_EXPOSURE_LOCK);
+        if (status == CameraParameters::TRUE)
+            return 1.;
+        else
+            return 0.;
+    }
+#endif
     default:
         LOGW("CameraHandler::getProperty - Unsupported property.");
     };
@@ -824,99 +884,151 @@ void CameraHandler::setProperty(int propIdx, double value)
 {
     LOGD("CameraHandler::setProperty(%d, %f)", propIdx, value);
 
+    android::String8 params_str;
+    params_str = camera->getParameters();
+    LOGI("Params before set: [%s]", params_str.string());
+
     switch (propIdx)
     {
     case ANDROID_CAMERA_PROPERTY_FRAMEWIDTH:
     {
         int w,h;
-        params.getPreviewSize(&w, &h);
-        w = (int)value;
-        params.setPreviewSize(w, h);
+        params->getPreviewSize(&w, &h);
+        width = (int)value;
     }
     break;
     case ANDROID_CAMERA_PROPERTY_FRAMEHEIGHT:
     {
         int w,h;
-        params.getPreviewSize(&w, &h);
-        h = (int)value;
-        params.setPreviewSize(w, h);
+        params->getPreviewSize(&w, &h);
+        height = (int)value;
     }
     break;
     case ANDROID_CAMERA_PROPERTY_EXPOSURE:
     {
-        int max_exposure = params.getInt("max-exposure-compensation");
-        int min_exposure = params.getInt("min-exposure-compensation");
-        if(max_exposure && min_exposure){
+        int max_exposure = params->getInt("max-exposure-compensation");
+        int min_exposure = params->getInt("min-exposure-compensation");
+        if(max_exposure && min_exposure)
+        {
             int exposure = (int)value;
-            if(exposure >= min_exposure && exposure <= max_exposure){
-                params.set("exposure-compensation", exposure);
-            } else {
+            if(exposure >= min_exposure && exposure <= max_exposure)
+                params->set("exposure-compensation", exposure);
+            else
                 LOGE("Exposure compensation not in valid range (%i,%i).", min_exposure, max_exposure);
-            }
-        } else {
+        } else
             LOGE("Exposure compensation adjust is not supported.");
-        }
+
+        camera->setParameters(params->flatten());
     }
     break;
     case ANDROID_CAMERA_PROPERTY_FLASH_MODE:
     {
         int new_val = (int)value;
-        if(new_val >= 0 && new_val < ANDROID_CAMERA_FLASH_MODES_NUM){
+        if(new_val >= 0 && new_val < ANDROID_CAMERA_FLASH_MODES_NUM)
+        {
             const char* mode_name = flashModesNames[new_val];
             if(is_supported(CameraParameters::KEY_SUPPORTED_FLASH_MODES, mode_name))
-                params.set(CameraParameters::KEY_FLASH_MODE, mode_name);
+                params->set(CameraParameters::KEY_FLASH_MODE, mode_name);
             else
                 LOGE("Flash mode %s is not supported.", mode_name);
-        } else {
-            LOGE("Flash mode value not in valid range.");
         }
+        else
+            LOGE("Flash mode value not in valid range.");
+
+        camera->setParameters(params->flatten());
     }
     break;
     case ANDROID_CAMERA_PROPERTY_FOCUS_MODE:
     {
         int new_val = (int)value;
-        if(new_val >= 0 && new_val < ANDROID_CAMERA_FOCUS_MODES_NUM){
+        if(new_val >= 0 && new_val < ANDROID_CAMERA_FOCUS_MODES_NUM)
+        {
             const char* mode_name = focusModesNames[new_val];
             if(is_supported(CameraParameters::KEY_SUPPORTED_FOCUS_MODES, mode_name))
-                params.set(CameraParameters::KEY_FOCUS_MODE, mode_name);
+                params->set(CameraParameters::KEY_FOCUS_MODE, mode_name);
             else
                 LOGE("Focus mode %s is not supported.", mode_name);
-        } else {
-            LOGE("Focus mode value not in valid range.");
         }
+        else
+            LOGE("Focus mode value not in valid range.");
+
+        camera->setParameters(params->flatten());
     }
     break;
     case ANDROID_CAMERA_PROPERTY_WHITE_BALANCE:
     {
         int new_val = (int)value;
-        if(new_val >= 0 && new_val < ANDROID_CAMERA_WHITE_BALANCE_MODES_NUM){
+        if(new_val >= 0 && new_val < ANDROID_CAMERA_WHITE_BALANCE_MODES_NUM)
+        {
             const char* mode_name = whiteBalanceModesNames[new_val];
             if(is_supported(CameraParameters::KEY_SUPPORTED_WHITE_BALANCE, mode_name))
-                params.set(CameraParameters::KEY_WHITE_BALANCE, mode_name);
+                params->set(CameraParameters::KEY_WHITE_BALANCE, mode_name);
             else
                 LOGE("White balance mode %s is not supported.", mode_name);
-        } else {
-            LOGE("White balance mode value not in valid range.");
         }
+        else
+            LOGE("White balance mode value not in valid range.");
+
+        camera->setParameters(params->flatten());
     }
     break;
     case ANDROID_CAMERA_PROPERTY_ANTIBANDING:
     {
         int new_val = (int)value;
-        if(new_val >= 0 && new_val < ANDROID_CAMERA_ANTIBANDING_MODES_NUM){
+        if(new_val >= 0 && new_val < ANDROID_CAMERA_ANTIBANDING_MODES_NUM)
+        {
             const char* mode_name = antibandingModesNames[new_val];
             if(is_supported(CameraParameters::KEY_SUPPORTED_ANTIBANDING, mode_name))
-                params.set(CameraParameters::KEY_ANTIBANDING, mode_name);
+                params->set(CameraParameters::KEY_ANTIBANDING, mode_name);
             else
                 LOGE("Antibanding mode %s is not supported.", mode_name);
-        } else {
-            LOGE("Antibanding mode value not in valid range.");
         }
+        else
+            LOGE("Antibanding mode value not in valid range.");
+
+        camera->setParameters(params->flatten());
     }
     break;
+#if !defined(ANDROID_r2_2_0) && !defined(ANDROID_r2_3_3) && !defined(ANDROID_r3_0_1)
+    case ANDROID_CAMERA_PROPERTY_EXPOSE_LOCK:
+    {
+        if (is_supported(CameraParameters::KEY_AUTO_EXPOSURE_LOCK_SUPPORTED, "true"))
+        {
+            if (value != 0)
+                params->set(CameraParameters::KEY_AUTO_EXPOSURE_LOCK, CameraParameters::TRUE);
+            else
+                params->set(CameraParameters::KEY_AUTO_EXPOSURE_LOCK, CameraParameters::FALSE);
+            LOGE("Expose lock is set");
+        }
+        else
+            LOGE("Expose lock is not supported");
+
+        camera->setParameters(params->flatten());
+    }
+    break;
+    case ANDROID_CAMERA_PROPERTY_WHITEBALANCE_LOCK:
+    {
+        if (is_supported(CameraParameters::KEY_AUTO_WHITEBALANCE_LOCK_SUPPORTED, "true"))
+        {
+            if (value != 0)
+                params->set(CameraParameters::KEY_AUTO_WHITEBALANCE_LOCK, CameraParameters::TRUE);
+            else
+                params->set(CameraParameters::KEY_AUTO_WHITEBALANCE_LOCK, CameraParameters::FALSE);
+            LOGE("White balance lock is set");
+        }
+        else
+            LOGE("White balance lock is not supported");
+
+        camera->setParameters(params->flatten());
+    }
+    break;
+#endif
     default:
         LOGW("CameraHandler::setProperty - Unsupported property.");
     };
+
+    params_str = camera->getParameters();
+    LOGI("Params after set: [%s]", params_str.string());
 }
 
 void CameraHandler::applyProperties(CameraHandler** ppcameraHandler)
@@ -935,7 +1047,10 @@ void CameraHandler::applyProperties(CameraHandler** ppcameraHandler)
         return;
     }
 
-    CameraParameters curCameraParameters((*ppcameraHandler)->params.flatten());
+    // delayed resolution setup: avoids errors while other parameters are being set on the fly
+    // without a camera restart
+    if (((*ppcameraHandler)->width != 0) && ((*ppcameraHandler)->height != 0))
+        (*ppcameraHandler)->params->setPreviewSize((*ppcameraHandler)->width, (*ppcameraHandler)->height);
 
 #if defined(ANDROID_r4_0_0) || defined(ANDROID_r4_0_3) || defined(ANDROID_r4_1_1) || defined(ANDROID_r4_2_0) \
  || defined(ANDROID_r4_3_0) || defined(ANDROID_r4_4_0)
@@ -951,27 +1066,27 @@ void CameraHandler::applyProperties(CameraHandler** ppcameraHandler)
         return;
     }
 
-    handler->camera->setParameters(curCameraParameters.flatten());
-    handler->params.unflatten(curCameraParameters.flatten());
+    handler->camera->setParameters((*ppcameraHandler)->params->flatten());
 
     status_t bufferStatus;
 # if defined(ANDROID_r4_0_0) || defined(ANDROID_r4_0_3)
-    sp<SurfaceTexture> surfaceTexture = new SurfaceTexture(MAGIC_OPENCV_TEXTURE_ID);
-    bufferStatus = handler->camera->setPreviewTexture(surfaceTexture);
+    void* surface_texture_obj = operator new(sizeof(SurfaceTexture) + MAGIC_TAIL);
+    handler->surface = new(surface_texture_obj) SurfaceTexture(MAGIC_OPENCV_TEXTURE_ID);
+    bufferStatus = handler->camera->setPreviewTexture(handler->surface);
     if (bufferStatus != 0)
         LOGE("applyProperties: failed setPreviewTexture call (status %d); camera might not work correctly", bufferStatus);
 # elif defined(ANDROID_r4_1_1) || defined(ANDROID_r4_2_0) || defined(ANDROID_r4_3_0)
-    sp<BufferQueue> bufferQueue = new BufferQueue();
-    sp<BufferQueue::ConsumerListener> queueListener = new ConsumerListenerStub();
-    bufferQueue->consumerConnect(queueListener);
-    bufferStatus = handler->camera->setPreviewTexture(bufferQueue);
+    void* buffer_queue_obj = operator new(sizeof(BufferQueue) + MAGIC_TAIL);
+    handler->queue = new(buffer_queue_obj) BufferQueue();
+    handler->queue->consumerConnect(handler->listener);
+    bufferStatus = handler->camera->setPreviewTexture(handler->queue);
     if (bufferStatus != 0)
         LOGE("applyProperties: failed setPreviewTexture call; camera might not work correctly");
 # elif defined(ANDROID_r4_4_0)
-    sp<BufferQueue> bufferQueue = new BufferQueue();
-    sp<IConsumerListener> queueListener = new ConsumerListenerStub();
-    bufferQueue->consumerConnect(queueListener, true);
-    bufferStatus = handler->camera->setPreviewTarget(bufferQueue);
+    void* buffer_queue_obj = operator new(sizeof(BufferQueue) + MAGIC_TAIL);
+    handler->queue = new(buffer_queue_obj) BufferQueue();
+    handler->queue->consumerConnect(handler->listener, true);
+    bufferStatus = handler->camera->setPreviewTarget(handler->queue);
     if (bufferStatus != 0)
         LOGE("applyProperties: failed setPreviewTexture call; camera might not work correctly");
 # endif
@@ -1002,7 +1117,7 @@ void CameraHandler::applyProperties(CameraHandler** ppcameraHandler)
     LOGD("CameraHandler::applyProperties(): after previousCameraHandler->closeCameraConnect");
 
     LOGD("CameraHandler::applyProperties(): before initCameraConnect");
-    CameraHandler* handler=initCameraConnect(cameraCallback, cameraId, userData, &curCameraParameters);
+    CameraHandler* handler=initCameraConnect(cameraCallback, cameraId, userData, (*ppcameraHandler)->params);
     LOGD("CameraHandler::applyProperties(): after initCameraConnect, handler=0x%x", (int)handler);
     if (handler == NULL) {
         LOGE("ERROR in applyProperties --- cannot reinit camera");
diff --git a/modules/androidcamera/include/camera_properties.h b/modules/androidcamera/include/camera_properties.h
index 2fec745fa..65499be2d 100644
--- a/modules/androidcamera/include/camera_properties.h
+++ b/modules/androidcamera/include/camera_properties.h
@@ -15,7 +15,9 @@ enum {
     ANDROID_CAMERA_PROPERTY_FOCAL_LENGTH = 105,
     ANDROID_CAMERA_PROPERTY_FOCUS_DISTANCE_NEAR = 106,
     ANDROID_CAMERA_PROPERTY_FOCUS_DISTANCE_OPTIMAL = 107,
-    ANDROID_CAMERA_PROPERTY_FOCUS_DISTANCE_FAR = 108
+    ANDROID_CAMERA_PROPERTY_FOCUS_DISTANCE_FAR = 108,
+    ANDROID_CAMERA_PROPERTY_EXPOSE_LOCK = 109,
+    ANDROID_CAMERA_PROPERTY_WHITEBALANCE_LOCK = 110
 };
 
 
@@ -30,12 +32,12 @@ enum {
 
 enum {
     ANDROID_CAMERA_FOCUS_MODE_AUTO = 0,
-    ANDROID_CAMERA_FOCUS_MODE_CONTINUOUS_PICTURE,
     ANDROID_CAMERA_FOCUS_MODE_CONTINUOUS_VIDEO,
     ANDROID_CAMERA_FOCUS_MODE_EDOF,
     ANDROID_CAMERA_FOCUS_MODE_FIXED,
     ANDROID_CAMERA_FOCUS_MODE_INFINITY,
     ANDROID_CAMERA_FOCUS_MODE_MACRO,
+    ANDROID_CAMERA_FOCUS_MODE_CONTINUOUS_PICTURE,
     ANDROID_CAMERA_FOCUS_MODES_NUM
 };
 
diff --git a/modules/calib3d/src/opencl/stereobm.cl b/modules/calib3d/src/opencl/stereobm.cl
index a746c8950..73402a6a1 100644
--- a/modules/calib3d/src/opencl/stereobm.cl
+++ b/modules/calib3d/src/opencl/stereobm.cl
@@ -147,6 +147,8 @@ __kernel void stereoBM(__global const uchar * leftptr, __global const uchar * ri
     __local int best_disp[2];
     __local int best_cost[2];
     best_cost[nthread] = MAX_VAL;
+    best_disp[nthread] = MAX_VAL;
+    barrier(CLK_LOCAL_MEM_FENCE);
 
     short costbuf[wsz];
     int head = 0;
@@ -159,7 +161,7 @@ __kernel void stereoBM(__global const uchar * leftptr, __global const uchar * ri
     int costIdx = calcLocalIdx(lx, ly, d, sizeY);
     cost = costFunc + costIdx;
 
-    short tempcost = 0;
+    int tempcost = 0;
     if(x < cols-wsz2-mindisp && y < rows-wsz2)
     {
         int shift = 1*nthread + cols*(1-nthread);
@@ -191,7 +193,7 @@ __kernel void stereoBM(__global const uchar * leftptr, __global const uchar * ri
     barrier(CLK_LOCAL_MEM_FENCE);
 
     if(best_cost[1] == tempcost)
-        best_disp[1] = ndisp - d - 1;
+        atomic_min(best_disp + 1, ndisp - d - 1);
     barrier(CLK_LOCAL_MEM_FENCE);
 
     int dispIdx = mad24(gy, disp_step, disp_offset + gx*(int)sizeof(short));
@@ -209,6 +211,7 @@ __kernel void stereoBM(__global const uchar * leftptr, __global const uchar * ri
         y = (ly < sizeY) ? gy + shiftY + ly : rows;
 
         best_cost[nthread] = MAX_VAL;
+        best_disp[nthread] = MAX_VAL;
         barrier(CLK_LOCAL_MEM_FENCE);
 
         costIdx = calcLocalIdx(lx, ly, d, sizeY);
@@ -227,12 +230,11 @@ __kernel void stereoBM(__global const uchar * leftptr, __global const uchar * ri
         barrier(CLK_LOCAL_MEM_FENCE);
 
         if(best_cost[nthread] == tempcost)
-            best_disp[nthread] = ndisp - d - 1;
+            atomic_min(best_disp + nthread, ndisp - d - 1);
         barrier(CLK_LOCAL_MEM_FENCE);
 
         int dispIdx = mad24(gy+ly, disp_step, disp_offset + (gx+lx)*(int)sizeof(short));
         disp = (__global short *)(dispptr + dispIdx);
-
         calcDisp(cost, disp, uniquenessRatio, mindisp, ndisp, 2*sizeY,
             best_disp + nthread, best_cost + nthread, d, x, y, cols, rows, wsz2);
         barrier(CLK_LOCAL_MEM_FENCE);
diff --git a/modules/core/doc/old_basic_structures.rst b/modules/core/doc/old_basic_structures.rst
index 906306dcb..6f26c76c0 100644
--- a/modules/core/doc/old_basic_structures.rst
+++ b/modules/core/doc/old_basic_structures.rst
@@ -1387,7 +1387,7 @@ description rewritten using
 
     IplImage* color_img = cvCreateImage(cvSize(320,240), IPL_DEPTH_8U, 3);
     IplImage gray_img_hdr, *gray_img;
-    gray_img = (IplImage*)cvReshapeND(color_img, &gray_img_hdr, 1, 0, 0);
+    gray_img = (IplImage*)cvReshapeMatND(color_img, sizeof(gray_img_hdr), &gray_img_hdr, 1, 0, 0);
 
     ...
 
@@ -1395,6 +1395,18 @@ description rewritten using
     int size[] = { 2, 2, 2 };
     CvMatND* mat = cvCreateMatND(3, size, CV_32F);
     CvMat row_header, *row;
+    row = (CvMat*)cvReshapeMatND(mat, sizeof(row_header), &row_header, 0, 1, 0);
+
+..
+
+In C, the header file for this function includes a convenient macro ``cvReshapeND`` that does away with the ``sizeof_header`` parameter. So, the lines containing the call to ``cvReshapeMatND`` in the examples may be replaced as follows:
+
+::
+
+    gray_img = (IplImage*)cvReshapeND(color_img, &gray_img_hdr, 1, 0, 0);
+
+    ...
+
     row = (CvMat*)cvReshapeND(mat, &row_header, 0, 1, 0);
 
 ..
diff --git a/modules/core/include/opencv2/core/mat.hpp b/modules/core/include/opencv2/core/mat.hpp
index 33167fab2..6b8368fd5 100644
--- a/modules/core/include/opencv2/core/mat.hpp
+++ b/modules/core/include/opencv2/core/mat.hpp
@@ -118,6 +118,8 @@ public:
 
     virtual int kind() const;
     virtual int dims(int i=-1) const;
+    virtual int cols(int i=-1) const;
+    virtual int rows(int i=-1) const;
     virtual Size size(int i=-1) const;
     virtual int sizend(int* sz, int i=-1) const;
     virtual bool sameSize(const _InputArray& arr) const;
diff --git a/modules/core/include/opencv2/core/ocl.hpp b/modules/core/include/opencv2/core/ocl.hpp
index fb9ec24c5..fdb6f9a0a 100644
--- a/modules/core/include/opencv2/core/ocl.hpp
+++ b/modules/core/include/opencv2/core/ocl.hpp
@@ -592,7 +592,7 @@ protected:
 CV_EXPORTS const char* convertTypeStr(int sdepth, int ddepth, int cn, char* buf);
 CV_EXPORTS const char* typeToStr(int t);
 CV_EXPORTS const char* memopTypeToStr(int t);
-CV_EXPORTS String kernelToStr(InputArray _kernel, int ddepth = -1);
+CV_EXPORTS String kernelToStr(InputArray _kernel, int ddepth = -1, const char * name = NULL);
 CV_EXPORTS void getPlatfomsInfo(std::vector<PlatformInfo>& platform_info);
 CV_EXPORTS int predictOptimalVectorWidth(InputArray src1, InputArray src2 = noArray(), InputArray src3 = noArray(),
                                          InputArray src4 = noArray(), InputArray src5 = noArray(), InputArray src6 = noArray(),
diff --git a/modules/core/src/arithm.cpp b/modules/core/src/arithm.cpp
index 5672c02ad..58442e559 100644
--- a/modules/core/src/arithm.cpp
+++ b/modules/core/src/arithm.cpp
@@ -1299,7 +1299,7 @@ static bool ocl_arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
     int type1 = _src1.type(), depth1 = CV_MAT_DEPTH(type1), cn = CV_MAT_CN(type1);
     bool haveMask = !_mask.empty();
 
-    if( ((haveMask || haveScalar) && cn > 4) )
+    if ( (haveMask || haveScalar) && cn > 4 )
         return false;
 
     int dtype = _dst.type(), ddepth = CV_MAT_DEPTH(dtype), wdepth = std::max(CV_32S, CV_MAT_DEPTH(wtype));
@@ -1320,14 +1320,11 @@ static bool ocl_arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
             "-D convertToWT2=%s -D convertToDT=%s%s -D cn=%d",
             (haveMask ? "MASK_" : ""), (haveScalar ? "UNARY_OP" : "BINARY_OP"),
             oclop2str[oclop], ocl::typeToStr(CV_MAKETYPE(depth1, kercn)),
-            ocl::typeToStr(CV_MAKETYPE(depth1, 1)),
-            ocl::typeToStr(CV_MAKETYPE(depth2, kercn)),
-            ocl::typeToStr(CV_MAKETYPE(depth2, 1)),
-            ocl::typeToStr(CV_MAKETYPE(ddepth, kercn)),
-            ocl::typeToStr(CV_MAKETYPE(ddepth, 1)),
-            ocl::typeToStr(CV_MAKETYPE(wdepth, kercn)),
+            ocl::typeToStr(depth1), ocl::typeToStr(CV_MAKETYPE(depth2, kercn)),
+            ocl::typeToStr(depth2), ocl::typeToStr(CV_MAKETYPE(ddepth, kercn)),
+            ocl::typeToStr(ddepth), ocl::typeToStr(CV_MAKETYPE(wdepth, kercn)),
             ocl::typeToStr(CV_MAKETYPE(wdepth, scalarcn)),
-            ocl::typeToStr(CV_MAKETYPE(wdepth, 1)), wdepth,
+            ocl::typeToStr(wdepth), wdepth,
             ocl::convertTypeStr(depth1, wdepth, kercn, cvtstr[0]),
             ocl::convertTypeStr(depth2, wdepth, kercn, cvtstr[1]),
             ocl::convertTypeStr(wdepth, ddepth, kercn, cvtstr[2]),
@@ -1347,7 +1344,7 @@ static bool ocl_arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
     }
 
     ocl::Kernel k("KF", ocl::core::arithm_oclsrc, opts);
-    if( k.empty() )
+    if (k.empty())
         return false;
 
     UMat src1 = _src1.getUMat(), src2;
@@ -1388,12 +1385,12 @@ static bool ocl_arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
 
         if( !haveMask )
         {
-            if(n == 0)
+            if (n == 0)
                 k.args(src1arg, src2arg, dstarg);
-            else if(n == 1)
+            else if (n == 1)
                 k.args(src1arg, src2arg, dstarg,
                        ocl::KernelArg(0, 0, 0, 0, usrdata_p, usrdata_esz));
-            else if(n == 3)
+            else if (n == 3)
                 k.args(src1arg, src2arg, dstarg,
                        ocl::KernelArg(0, 0, 0, 0, usrdata_p, usrdata_esz),
                        ocl::KernelArg(0, 0, 0, 0, usrdata_p + usrdata_esz, usrdata_esz),
@@ -2621,53 +2618,37 @@ static bool ocl_compare(InputArray _src1, InputArray _src2, OutputArray _dst, in
 {
     const ocl::Device& dev = ocl::Device::getDefault();
     bool doubleSupport = dev.doubleFPConfig() > 0;
-    int type1 = _src1.type(), depth1 = CV_MAT_DEPTH(type1), cn = CV_MAT_CN(type1);
-    int type2 = _src2.type();
-
-    if (!haveScalar)
-    {
-        if ( (!doubleSupport && (depth1 == CV_64F || _src2.depth() == CV_64F)) ||
-            !_src1.sameSize(_src2) || type1 != type2)
-            return false;
-    }
-    else
-    {
-        if (cn > 1 || depth1 <= CV_32S) // FIXIT: if (cn > 4): Need to clear CPU-based compare behavior
-            return false;
-    }
+    int type1 = _src1.type(), depth1 = CV_MAT_DEPTH(type1), cn = CV_MAT_CN(type1),
+            type2 = _src2.type(), depth2 = CV_MAT_DEPTH(type2);
 
     if (!doubleSupport && depth1 == CV_64F)
         return false;
 
+    if (!haveScalar && (!_src1.sameSize(_src2) || type1 != type2))
+            return false;
+
     int kercn = haveScalar ? cn : ocl::predictOptimalVectorWidth(_src1, _src2, _dst);
     // Workaround for bug with "?:" operator in AMD OpenCL compiler
-    bool workaroundForAMD = /*dev.isAMD() &&*/
-            (
-                (depth1 != CV_8U && depth1 != CV_8S)
-            );
-    if (workaroundForAMD)
+    if (depth1 >= CV_16U)
         kercn = 1;
 
     int scalarcn = kercn == 3 ? 4 : kercn;
-
     const char * const operationMap[] = { "==", ">", ">=", "<", "<=", "!=" };
     char cvt[40];
 
-    String buildOptions = format(
-            "-D %s -D srcT1=%s -D dstT=%s -D workT=srcT1 -D cn=%d"
-            " -D convertToDT=%s -D OP_CMP -D CMP_OPERATOR=%s -D srcT1_C1=%s"
-            " -D srcT2_C1=%s -D dstT_C1=%s -D workST=%s%s",
-            (haveScalar ? "UNARY_OP" : "BINARY_OP"),
-            ocl::typeToStr(CV_MAKE_TYPE(depth1, kercn)),
-            ocl::typeToStr(CV_8UC(kercn)), kercn,
-            ocl::convertTypeStr(depth1, CV_8U, kercn, cvt),
-            operationMap[op],
-            ocl::typeToStr(depth1), ocl::typeToStr(depth1), ocl::typeToStr(CV_8U),
-            ocl::typeToStr(CV_MAKE_TYPE(depth1, scalarcn)),
-            doubleSupport ? " -D DOUBLE_SUPPORT" : ""
-            );
+    String opts = format("-D %s -D srcT1=%s -D dstT=%s -D workT=srcT1 -D cn=%d"
+                         " -D convertToDT=%s -D OP_CMP -D CMP_OPERATOR=%s -D srcT1_C1=%s"
+                         " -D srcT2_C1=%s -D dstT_C1=%s -D workST=%s%s",
+                         haveScalar ? "UNARY_OP" : "BINARY_OP",
+                         ocl::typeToStr(CV_MAKE_TYPE(depth1, kercn)),
+                         ocl::typeToStr(CV_8UC(kercn)), kercn,
+                         ocl::convertTypeStr(depth1, CV_8U, kercn, cvt),
+                         operationMap[op], ocl::typeToStr(depth1),
+                         ocl::typeToStr(depth1), ocl::typeToStr(CV_8U),
+                         ocl::typeToStr(CV_MAKE_TYPE(depth1, scalarcn)),
+                         doubleSupport ? " -D DOUBLE_SUPPORT" : "");
 
-    ocl::Kernel k("KF", ocl::core::arithm_oclsrc, buildOptions);
+    ocl::Kernel k("KF", ocl::core::arithm_oclsrc, opts);
     if (k.empty())
         return false;
 
@@ -2678,24 +2659,43 @@ static bool ocl_compare(InputArray _src1, InputArray _src2, OutputArray _dst, in
 
     if (haveScalar)
     {
-        size_t esz = CV_ELEM_SIZE1(type1)*scalarcn;
-        double buf[4]={0,0,0,0};
-        Mat src2sc = _src2.getMat();
+        size_t esz = CV_ELEM_SIZE1(type1) * scalarcn;
+        double buf[4] = { 0, 0, 0, 0 };
+        Mat src2 = _src2.getMat();
 
-        if (!src2sc.empty())
-            convertAndUnrollScalar(src2sc, type1, (uchar*)buf, 1);
+        if( depth1 > CV_32S )
+            convertAndUnrollScalar( src2, depth1, (uchar *)buf, kercn );
+        else
+        {
+            double fval = 0;
+            getConvertFunc(depth2, CV_64F)(src2.data, 0, 0, 0, (uchar *)&fval, 0, Size(1, 1), 0);
+            if( fval < getMinVal(depth1) )
+                return dst.setTo(Scalar::all(op == CMP_GT || op == CMP_GE || op == CMP_NE ? 255 : 0)), true;
+
+            if( fval > getMaxVal(depth1) )
+                return dst.setTo(Scalar::all(op == CMP_LT || op == CMP_LE || op == CMP_NE ? 255 : 0)), true;
+
+            int ival = cvRound(fval);
+            if( fval != ival )
+            {
+                if( op == CMP_LT || op == CMP_GE )
+                    ival = cvCeil(fval);
+                else if( op == CMP_LE || op == CMP_GT )
+                    ival = cvFloor(fval);
+                else
+                    return dst.setTo(Scalar::all(op == CMP_NE ? 255 : 0)), true;
+            }
+            convertAndUnrollScalar(Mat(1, 1, CV_32S, &ival), depth1, (uchar *)buf, kercn);
+        }
 
         ocl::KernelArg scalararg = ocl::KernelArg(0, 0, 0, 0, buf, esz);
 
         k.args(ocl::KernelArg::ReadOnlyNoSize(src1, cn, kercn),
-               ocl::KernelArg::WriteOnly(dst, cn, kercn),
-               scalararg);
+               ocl::KernelArg::WriteOnly(dst, cn, kercn), scalararg);
     }
     else
     {
-        CV_DbgAssert(type1 == type2);
         UMat src2 = _src2.getUMat();
-        CV_DbgAssert(size == src2.size());
 
         k.args(ocl::KernelArg::ReadOnlyNoSize(src1),
                ocl::KernelArg::ReadOnlyNoSize(src2),
diff --git a/modules/core/src/convert.cpp b/modules/core/src/convert.cpp
index 0139f6a5b..cd5cf9b73 100644
--- a/modules/core/src/convert.cpp
+++ b/modules/core/src/convert.cpp
@@ -415,42 +415,54 @@ namespace cv {
 
 static bool ocl_merge( InputArrayOfArrays _mv, OutputArray _dst )
 {
-    std::vector<UMat> src;
+    std::vector<UMat> src, ksrc;
     _mv.getUMatVector(src);
     CV_Assert(!src.empty());
 
     int type = src[0].type(), depth = CV_MAT_DEPTH(type);
     Size size = src[0].size();
 
-    size_t srcsize = src.size();
-    for (size_t i = 0; i < srcsize; ++i)
+    for (size_t i = 0, srcsize = src.size(); i < srcsize; ++i)
     {
-        int itype = src[i].type(), icn = CV_MAT_CN(itype), idepth = CV_MAT_DEPTH(itype);
-        if (src[i].dims > 2 || icn != 1)
+        int itype = src[i].type(), icn = CV_MAT_CN(itype), idepth = CV_MAT_DEPTH(itype),
+                esz1 = CV_ELEM_SIZE1(idepth);
+        if (src[i].dims > 2)
             return false;
-        CV_Assert(size == src[i].size() && depth == idepth);
-    }
 
-    String srcargs, srcdecl, processelem;
-    for (size_t i = 0; i < srcsize; ++i)
+        CV_Assert(size == src[i].size() && depth == idepth);
+
+        for (int cn = 0; cn < icn; ++cn)
+        {
+            UMat tsrc = src[i];
+            tsrc.offset += cn * esz1;
+            ksrc.push_back(tsrc);
+        }
+    }
+    int dcn = (int)ksrc.size();
+
+    String srcargs, srcdecl, processelem, cndecl;
+    for (int i = 0; i < dcn; ++i)
     {
         srcargs += format("DECLARE_SRC_PARAM(%d)", i);
         srcdecl += format("DECLARE_DATA(%d)", i);
         processelem += format("PROCESS_ELEM(%d)", i);
+        cndecl += format(" -D scn%d=%d", i, ksrc[i].channels());
     }
 
     ocl::Kernel k("merge", ocl::core::split_merge_oclsrc,
-                  format("-D OP_MERGE -D cn=%d -D T=%s -D DECLARE_SRC_PARAMS_N=%s -D DECLARE_DATA_N=%s -D PROCESS_ELEMS_N=%s",
-                         (int)srcsize, ocl::memopTypeToStr(depth), srcargs.c_str(), srcdecl.c_str(), processelem.c_str()));
+                  format("-D OP_MERGE -D cn=%d -D T=%s -D DECLARE_SRC_PARAMS_N=%s"
+                         " -D DECLARE_DATA_N=%s -D PROCESS_ELEMS_N=%s%s",
+                         dcn, ocl::memopTypeToStr(depth), srcargs.c_str(),
+                         srcdecl.c_str(), processelem.c_str(), cndecl.c_str()));
     if (k.empty())
         return false;
 
-    _dst.create(size, CV_MAKE_TYPE(depth, (int)srcsize));
+    _dst.create(size, CV_MAKE_TYPE(depth, dcn));
     UMat dst = _dst.getUMat();
 
     int argidx = 0;
-    for (size_t i = 0; i < srcsize; ++i)
-        argidx = k.set(argidx, ocl::KernelArg::ReadOnlyNoSize(src[i]));
+    for (int i = 0; i < dcn; ++i)
+        argidx = k.set(argidx, ocl::KernelArg::ReadOnlyNoSize(ksrc[i]));
     k.set(argidx, ocl::KernelArg::WriteOnly(dst));
 
     size_t globalsize[2] = { dst.cols, dst.rows };
diff --git a/modules/core/src/copy.cpp b/modules/core/src/copy.cpp
index e3e959c95..5ac5f22c5 100644
--- a/modules/core/src/copy.cpp
+++ b/modules/core/src/copy.cpp
@@ -482,9 +482,9 @@ enum { FLIP_COLS = 1 << 0, FLIP_ROWS = 1 << 1, FLIP_BOTH = FLIP_ROWS | FLIP_COLS
 static bool ocl_flip(InputArray _src, OutputArray _dst, int flipCode )
 {
     CV_Assert(flipCode >= - 1 && flipCode <= 1);
-    int type = _src.type(), cn = CV_MAT_CN(type), flipType;
+    int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type), flipType;
 
-    if (cn > 4 || cn == 3)
+    if (cn > 4)
         return false;
 
     const char * kernelName;
@@ -506,7 +506,8 @@ static bool ocl_flip(InputArray _src, OutputArray _dst, int flipCode )
     }
 
     ocl::Kernel k(kernelName, ocl::core::flip_oclsrc,
-        format( "-D type=%s", ocl::memopTypeToStr(type)));
+        format( "-D T=%s -D T1=%s -D cn=%d", ocl::memopTypeToStr(type),
+                ocl::memopTypeToStr(depth), cn));
     if (k.empty())
         return false;
 
diff --git a/modules/core/src/mathfuncs.cpp b/modules/core/src/mathfuncs.cpp
index 095460c78..16df02caf 100644
--- a/modules/core/src/mathfuncs.cpp
+++ b/modules/core/src/mathfuncs.cpp
@@ -2041,7 +2041,7 @@ static bool ocl_pow(InputArray _src, double power, OutputArray _dst,
     const char * const op = issqrt ? "OP_SQRT" : is_ipower ? "OP_POWN" : "OP_POW";
 
     ocl::Kernel k("KF", ocl::core::arithm_oclsrc,
-                  format("-D dstT=%s -D %s -D UNARY_OP%s", ocl::typeToStr(CV_MAKE_TYPE(depth, 1)),
+                  format("-D dstT=%s -D %s -D UNARY_OP%s", ocl::typeToStr(depth),
                          op, doubleSupport ? " -D DOUBLE_SUPPORT" : ""));
     if (k.empty())
         return false;
@@ -2081,7 +2081,7 @@ void pow( InputArray _src, double power, OutputArray _dst )
     {
         if( ipower < 0 )
         {
-            divide( 1., _src, _dst );
+            divide( Scalar::all(1), _src, _dst );
             if( ipower == -1 )
                 return;
             ipower = -ipower;
@@ -2115,10 +2115,7 @@ void pow( InputArray _src, double power, OutputArray _dst )
 
     Mat src, dst;
     if (same)
-    {
-        dst = _dst.getMat();
-        src = dst;
-    }
+        src = dst = _dst.getMat();
     else
     {
         src = _src.getMat();
diff --git a/modules/core/src/matrix.cpp b/modules/core/src/matrix.cpp
index db1ce760f..45ae3d512 100644
--- a/modules/core/src/matrix.cpp
+++ b/modules/core/src/matrix.cpp
@@ -1416,6 +1416,16 @@ int _InputArray::kind() const
     return flags & KIND_MASK;
 }
 
+int _InputArray::rows(int i) const
+{
+    return size(i).height;
+}
+
+int _InputArray::cols(int i) const
+{
+    return size(i).width;
+}
+
 Size _InputArray::size(int i) const
 {
     int k = kind();
@@ -2078,45 +2088,45 @@ void _OutputArray::create(Size _sz, int mtype, int i, bool allowTransposed, int
     create(2, sizes, mtype, i, allowTransposed, fixedDepthMask);
 }
 
-void _OutputArray::create(int rows, int cols, int mtype, int i, bool allowTransposed, int fixedDepthMask) const
+void _OutputArray::create(int _rows, int _cols, int mtype, int i, bool allowTransposed, int fixedDepthMask) const
 {
     int k = kind();
     if( k == MAT && i < 0 && !allowTransposed && fixedDepthMask == 0 )
     {
-        CV_Assert(!fixedSize() || ((Mat*)obj)->size.operator()() == Size(cols, rows));
+        CV_Assert(!fixedSize() || ((Mat*)obj)->size.operator()() == Size(_cols, _rows));
         CV_Assert(!fixedType() || ((Mat*)obj)->type() == mtype);
-        ((Mat*)obj)->create(rows, cols, mtype);
+        ((Mat*)obj)->create(_rows, _cols, mtype);
         return;
     }
     if( k == UMAT && i < 0 && !allowTransposed && fixedDepthMask == 0 )
     {
-        CV_Assert(!fixedSize() || ((UMat*)obj)->size.operator()() == Size(cols, rows));
+        CV_Assert(!fixedSize() || ((UMat*)obj)->size.operator()() == Size(_cols, _rows));
         CV_Assert(!fixedType() || ((UMat*)obj)->type() == mtype);
-        ((UMat*)obj)->create(rows, cols, mtype);
+        ((UMat*)obj)->create(_rows, _cols, mtype);
         return;
     }
     if( k == GPU_MAT && i < 0 && !allowTransposed && fixedDepthMask == 0 )
     {
-        CV_Assert(!fixedSize() || ((cuda::GpuMat*)obj)->size() == Size(cols, rows));
+        CV_Assert(!fixedSize() || ((cuda::GpuMat*)obj)->size() == Size(_cols, _rows));
         CV_Assert(!fixedType() || ((cuda::GpuMat*)obj)->type() == mtype);
-        ((cuda::GpuMat*)obj)->create(rows, cols, mtype);
+        ((cuda::GpuMat*)obj)->create(_rows, _cols, mtype);
         return;
     }
     if( k == OPENGL_BUFFER && i < 0 && !allowTransposed && fixedDepthMask == 0 )
     {
-        CV_Assert(!fixedSize() || ((ogl::Buffer*)obj)->size() == Size(cols, rows));
+        CV_Assert(!fixedSize() || ((ogl::Buffer*)obj)->size() == Size(_cols, _rows));
         CV_Assert(!fixedType() || ((ogl::Buffer*)obj)->type() == mtype);
-        ((ogl::Buffer*)obj)->create(rows, cols, mtype);
+        ((ogl::Buffer*)obj)->create(_rows, _cols, mtype);
         return;
     }
     if( k == CUDA_MEM && i < 0 && !allowTransposed && fixedDepthMask == 0 )
     {
-        CV_Assert(!fixedSize() || ((cuda::CudaMem*)obj)->size() == Size(cols, rows));
+        CV_Assert(!fixedSize() || ((cuda::CudaMem*)obj)->size() == Size(_cols, _rows));
         CV_Assert(!fixedType() || ((cuda::CudaMem*)obj)->type() == mtype);
-        ((cuda::CudaMem*)obj)->create(rows, cols, mtype);
+        ((cuda::CudaMem*)obj)->create(_rows, _cols, mtype);
         return;
     }
-    int sizes[] = {rows, cols};
+    int sizes[] = {_rows, _cols};
     create(2, sizes, mtype, i, allowTransposed, fixedDepthMask);
 }
 
@@ -2679,17 +2689,17 @@ namespace cv {
 
 static bool ocl_setIdentity( InputOutputArray _m, const Scalar& s )
 {
-    int type = _m.type(), cn = CV_MAT_CN(type);
-    if (cn == 3)
-        return false;
+    int type = _m.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type),
+            sctype = CV_MAKE_TYPE(depth, cn == 3 ? 4 : cn);
 
     ocl::Kernel k("setIdentity", ocl::core::set_identity_oclsrc,
-                  format("-D T=%s", ocl::memopTypeToStr(type)));
+                  format("-D T=%s -D T1=%s -D cn=%d -D ST=%s", ocl::memopTypeToStr(type),
+                         ocl::memopTypeToStr(depth), cn, ocl::memopTypeToStr(sctype)));
     if (k.empty())
         return false;
 
     UMat m = _m.getUMat();
-    k.args(ocl::KernelArg::WriteOnly(m), ocl::KernelArg::Constant(Mat(1, 1, type, s)));
+    k.args(ocl::KernelArg::WriteOnly(m), ocl::KernelArg::Constant(Mat(1, 1, sctype, s)));
 
     size_t globalsize[2] = { m.cols, m.rows };
     return k.run(2, globalsize, NULL, false);
diff --git a/modules/core/src/ocl.cpp b/modules/core/src/ocl.cpp
index 7c4f8de9e..ffea804ed 100644
--- a/modules/core/src/ocl.cpp
+++ b/modules/core/src/ocl.cpp
@@ -1410,7 +1410,7 @@ bool useOpenCL()
 {
     CoreTLSData* data = coreTlsData.get();
     if( data->useOpenCL < 0 )
-        data->useOpenCL = (int)haveOpenCL();
+        data->useOpenCL = (int)haveOpenCL() && Device::getDefault().ptr() != NULL;
     return data->useOpenCL > 0;
 }
 
@@ -1419,7 +1419,7 @@ void setUseOpenCL(bool flag)
     if( haveOpenCL() )
     {
         CoreTLSData* data = coreTlsData.get();
-        data->useOpenCL = flag ? 1 : 0;
+        data->useOpenCL = (flag && Device::getDefault().ptr() != NULL) ? 1 : 0;
     }
 }
 
@@ -2179,7 +2179,6 @@ static cl_device_id selectOpenCLDevice()
             goto not_found;
         }
     }
-
     if (deviceTypes.size() == 0)
     {
         if (!isID)
@@ -2193,13 +2192,16 @@ static cl_device_id selectOpenCLDevice()
     for (size_t t = 0; t < deviceTypes.size(); t++)
     {
         int deviceType = 0;
-        if (deviceTypes[t] == "GPU")
+        std::string tempStrDeviceType = deviceTypes[t];
+        std::transform( tempStrDeviceType.begin(), tempStrDeviceType.end(), tempStrDeviceType.begin(), tolower );
+
+        if (tempStrDeviceType == "gpu" || tempStrDeviceType == "dgpu" || tempStrDeviceType == "igpu")
             deviceType = Device::TYPE_GPU;
-        else if (deviceTypes[t] == "CPU")
+        else if (tempStrDeviceType == "cpu")
             deviceType = Device::TYPE_CPU;
-        else if (deviceTypes[t] == "ACCELERATOR")
+        else if (tempStrDeviceType == "accelerator")
             deviceType = Device::TYPE_ACCELERATOR;
-        else if (deviceTypes[t] == "ALL")
+        else if (tempStrDeviceType == "all")
             deviceType = Device::TYPE_ALL;
         else
         {
@@ -2229,7 +2231,14 @@ static cl_device_id selectOpenCLDevice()
         {
             std::string name;
             CV_OclDbgAssert(getStringInfo(clGetDeviceInfo, devices[i], CL_DEVICE_NAME, name) == CL_SUCCESS);
-            if (isID || name.find(deviceName) != std::string::npos)
+            cl_bool useGPU = true;
+            if(tempStrDeviceType == "dgpu" || tempStrDeviceType == "igpu")
+            {
+                cl_bool isIGPU = CL_FALSE;
+                clGetDeviceInfo(devices[i], CL_DEVICE_HOST_UNIFIED_MEMORY, sizeof(isIGPU), &isIGPU, NULL);
+                useGPU = tempStrDeviceType == "dgpu" ? !isIGPU : isIGPU;
+            }
+            if ( (isID || name.find(deviceName) != std::string::npos) && useGPU)
             {
                 // TODO check for OpenCL 1.1
                 return devices[i];
@@ -2245,6 +2254,7 @@ not_found:
         std::cerr << deviceTypes[t] << " ";
 
     std::cerr << std::endl << "    Device name: " << (deviceName.length() == 0 ? "any" : deviceName) << std::endl;
+    CV_Error(CL_INVALID_DEVICE, "Requested OpenCL device is not found");
     return NULL;
 }
 
@@ -4306,7 +4316,7 @@ static std::string kerToStr(const Mat & k)
     return stream.str();
 }
 
-String kernelToStr(InputArray _kernel, int ddepth)
+String kernelToStr(InputArray _kernel, int ddepth, const char * name)
 {
     Mat kernel = _kernel.getMat().reshape(1, 1);
 
@@ -4317,13 +4327,13 @@ String kernelToStr(InputArray _kernel, int ddepth)
     if (ddepth != depth)
         kernel.convertTo(kernel, ddepth);
 
-    typedef std::string (*func_t)(const Mat &);
-    static const func_t funcs[] = { kerToStr<uchar>, kerToStr<char>, kerToStr<ushort>,kerToStr<short>,
+    typedef std::string (* func_t)(const Mat &);
+    static const func_t funcs[] = { kerToStr<uchar>, kerToStr<char>, kerToStr<ushort>, kerToStr<short>,
                                     kerToStr<int>, kerToStr<float>, kerToStr<double>, 0 };
     const func_t func = funcs[depth];
     CV_Assert(func != 0);
 
-    return cv::format(" -D COEFF=%s", func(kernel).c_str());
+    return cv::format(" -D %s=%s", name ? name : "COEFF", func(kernel).c_str());
 }
 
 #define PROCESS_SRC(src) \
@@ -4347,7 +4357,7 @@ int predictOptimalVectorWidth(InputArray src1, InputArray src2, InputArray src3,
                               InputArray src4, InputArray src5, InputArray src6,
                               InputArray src7, InputArray src8, InputArray src9)
 {
-    int type = src1.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
+    int type = src1.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type), esz = CV_ELEM_SIZE(depth);
     Size ssize = src1.size();
     const ocl::Device & d = ocl::Device::getDefault();
 
@@ -4371,7 +4381,8 @@ int predictOptimalVectorWidth(InputArray src1, InputArray src2, InputArray src3,
     PROCESS_SRC(src9);
 
     size_t size = offsets.size();
-    std::vector<int> dividers(size, width);
+    int wsz = width * esz;
+    std::vector<int> dividers(size, wsz);
 
     for (size_t i = 0; i < size; ++i)
         while (offsets[i] % dividers[i] != 0 || steps[i] % dividers[i] != 0 || cols[i] % dividers[i] != 0)
@@ -4379,7 +4390,7 @@ int predictOptimalVectorWidth(InputArray src1, InputArray src2, InputArray src3,
 
     // default strategy
     for (size_t i = 0; i < size; ++i)
-        if (dividers[i] != width)
+        if (dividers[i] != wsz)
         {
             width = 1;
             break;
diff --git a/modules/core/src/opencl/flip.cl b/modules/core/src/opencl/flip.cl
index 0c874dbe6..bacfe7adf 100644
--- a/modules/core/src/opencl/flip.cl
+++ b/modules/core/src/opencl/flip.cl
@@ -39,10 +39,18 @@
 //
 //M*/
 
-#define sizeoftype ((int)sizeof(type))
+#if cn != 3
+#define loadpix(addr) *(__global const T *)(addr)
+#define storepix(val, addr)  *(__global T *)(addr) = val
+#define TSIZE (int)sizeof(T)
+#else
+#define loadpix(addr) vload3(0, (__global const T1 *)(addr))
+#define storepix(val, addr) vstore3(val, 0, (__global T1 *)(addr))
+#define TSIZE ((int)sizeof(T1)*3)
+#endif
 
-__kernel void arithm_flip_rows(__global const uchar* srcptr, int srcstep, int srcoffset,
-                               __global uchar* dstptr, int dststep, int dstoffset,
+__kernel void arithm_flip_rows(__global const uchar * srcptr, int src_step, int src_offset,
+                               __global uchar * dstptr, int dst_step, int dst_offset,
                                int rows, int cols, int thread_rows, int thread_cols)
 {
     int x = get_global_id(0);
@@ -50,19 +58,16 @@ __kernel void arithm_flip_rows(__global const uchar* srcptr, int srcstep, int sr
 
     if (x < cols && y < thread_rows)
     {
-        __global const type* src0 = (__global const type*)(srcptr + mad24(y, srcstep, mad24(x, sizeoftype, srcoffset)));
-        __global const type* src1 = (__global const type*)(srcptr + mad24(rows - y - 1, srcstep, mad24(x, sizeoftype, srcoffset)));
+        T src0 = loadpix(srcptr + mad24(y, src_step, mad24(x, TSIZE, src_offset)));
+        T src1 = loadpix(srcptr + mad24(rows - y - 1, src_step, mad24(x, TSIZE, src_offset)));
 
-        __global type* dst0 = (__global type*)(dstptr + mad24(y, dststep, mad24(x, sizeoftype, dstoffset)));
-        __global type* dst1 = (__global type*)(dstptr + mad24(rows - y - 1, dststep, mad24(x, sizeoftype, dstoffset)));
-
-        dst0[0] = src1[0];
-        dst1[0] = src0[0];
+        storepix(src1, dstptr + mad24(y, dst_step, mad24(x, TSIZE, dst_offset)));
+        storepix(src0, dstptr + mad24(rows - y - 1, dst_step, mad24(x, TSIZE, dst_offset)));
     }
 }
 
-__kernel void arithm_flip_rows_cols(__global const uchar* srcptr, int srcstep, int srcoffset,
-                                    __global uchar* dstptr, int dststep, int dstoffset,
+__kernel void arithm_flip_rows_cols(__global const uchar * srcptr, int src_step, int src_offset,
+                                    __global uchar * dstptr, int dst_step, int dst_offset,
                                     int rows, int cols, int thread_rows, int thread_cols)
 {
     int x = get_global_id(0);
@@ -71,19 +76,16 @@ __kernel void arithm_flip_rows_cols(__global const uchar* srcptr, int srcstep, i
     if (x < cols && y < thread_rows)
     {
         int x1 = cols - x - 1;
-        __global const type* src0 = (__global const type*)(srcptr + mad24(y, srcstep, mad24(x, sizeoftype, srcoffset)));
-        __global const type* src1 = (__global const type*)(srcptr + mad24(rows - y - 1, srcstep, mad24(x1, sizeoftype, srcoffset)));
+        T src0 = loadpix(srcptr + mad24(y, src_step, mad24(x, TSIZE, src_offset)));
+        T src1 = loadpix(srcptr + mad24(rows - y - 1, src_step, mad24(x1, TSIZE, src_offset)));
 
-        __global type* dst0 = (__global type*)(dstptr + mad24(rows - y - 1, dststep, mad24(x1, sizeoftype, dstoffset)));
-        __global type* dst1 = (__global type*)(dstptr + mad24(y, dststep, mad24(x, sizeoftype, dstoffset)));
-
-        dst0[0] = src0[0];
-        dst1[0] = src1[0];
+        storepix(src0, dstptr + mad24(rows - y - 1, dst_step, mad24(x1, TSIZE, dst_offset)));
+        storepix(src1, dstptr + mad24(y, dst_step, mad24(x, TSIZE, dst_offset)));
     }
 }
 
-__kernel void arithm_flip_cols(__global const uchar* srcptr, int srcstep, int srcoffset,
-                               __global uchar* dstptr, int dststep, int dstoffset,
+__kernel void arithm_flip_cols(__global const uchar * srcptr, int src_step, int src_offset,
+                               __global uchar * dstptr, int dst_step, int dst_offset,
                                int rows, int cols, int thread_rows, int thread_cols)
 {
     int x = get_global_id(0);
@@ -92,13 +94,10 @@ __kernel void arithm_flip_cols(__global const uchar* srcptr, int srcstep, int sr
     if (x < thread_cols && y < rows)
     {
         int x1 = cols - x - 1;
-        __global const type* src0 = (__global const type*)(srcptr + mad24(y, srcstep, mad24(x, sizeoftype, srcoffset)));
-        __global const type* src1 = (__global const type*)(srcptr + mad24(y, srcstep, mad24(x1, sizeoftype, srcoffset)));
+        T src0 = loadpix(srcptr + mad24(y, src_step, mad24(x, TSIZE, src_offset)));
+        T src1 = loadpix(srcptr + mad24(y, src_step, mad24(x1, TSIZE, src_offset)));
 
-        __global type* dst0 = (__global type*)(dstptr + mad24(y, dststep, mad24(x1, sizeoftype, dstoffset)));
-        __global type* dst1 = (__global type*)(dstptr + mad24(y, dststep, mad24(x, sizeoftype, dstoffset)));
-
-        dst1[0] = src1[0];
-        dst0[0] = src0[0];
+        storepix(src0, dstptr + mad24(y, dst_step, mad24(x1, TSIZE, dst_offset)));
+        storepix(src1, dstptr + mad24(y, dst_step, mad24(x, TSIZE, dst_offset)));
     }
 }
diff --git a/modules/core/src/opencl/set_identity.cl b/modules/core/src/opencl/set_identity.cl
index d63ce793d..0e8f1424f 100644
--- a/modules/core/src/opencl/set_identity.cl
+++ b/modules/core/src/opencl/set_identity.cl
@@ -43,17 +43,28 @@
 //
 //M*/
 
+#if cn != 3
+#define loadpix(addr) *(__global const T *)(addr)
+#define storepix(val, addr)  *(__global T *)(addr) = val
+#define TSIZE (int)sizeof(T)
+#define scalar scalar_
+#else
+#define loadpix(addr) vload3(0, (__global const T1 *)(addr))
+#define storepix(val, addr) vstore3(val, 0, (__global T1 *)(addr))
+#define TSIZE ((int)sizeof(T1)*3)
+#define scalar (T)(scalar_.x, scalar_.y, scalar_.z)
+#endif
+
 __kernel void setIdentity(__global uchar * srcptr, int src_step, int src_offset, int rows, int cols,
-                          T scalar)
+                          ST scalar_)
 {
     int x = get_global_id(0);
     int y = get_global_id(1);
 
     if (x < cols && y < rows)
     {
-        int src_index = mad24(y, src_step, mad24(x, (int)sizeof(T), src_offset));
-        __global T * src = (__global T *)(srcptr + src_index);
+        int src_index = mad24(y, src_step, mad24(x, TSIZE, src_offset));
 
-        src[0] = x == y ? scalar : (T)(0);
+        storepix(x == y ? scalar : (T)(0), srcptr + src_index);
     }
 }
diff --git a/modules/core/src/opencl/split_merge.cl b/modules/core/src/opencl/split_merge.cl
index 8a1bc4903..7d277056a 100644
--- a/modules/core/src/opencl/split_merge.cl
+++ b/modules/core/src/opencl/split_merge.cl
@@ -45,7 +45,7 @@
 
 #define DECLARE_SRC_PARAM(index) __global const uchar * src##index##ptr, int src##index##_step, int src##index##_offset,
 #define DECLARE_DATA(index) __global const T * src##index = \
-    (__global T *)(src##index##ptr + mad24(src##index##_step, y, mad24(x, (int)sizeof(T), src##index##_offset)));
+    (__global T *)(src##index##ptr + mad24(src##index##_step, y, mad24(x, (int)sizeof(T) * scn##index, src##index##_offset)));
 #define PROCESS_ELEM(index) dst[index] = src##index[0];
 
 __kernel void merge(DECLARE_SRC_PARAMS_N
diff --git a/modules/core/src/persistence.cpp b/modules/core/src/persistence.cpp
index 3755eccf7..293035f89 100644
--- a/modules/core/src/persistence.cpp
+++ b/modules/core/src/persistence.cpp
@@ -4824,7 +4824,7 @@ cvRegisterType( const CvTypeInfo* _info )
             "Type name should contain only letters, digits, - and _" );
     }
 
-    info = (CvTypeInfo*)malloc( sizeof(*info) + len + 1 );
+    info = (CvTypeInfo*)cvAlloc( sizeof(*info) + len + 1 );
 
     *info = *_info;
     info->type_name = (char*)(info + 1);
@@ -4862,7 +4862,7 @@ cvUnregisterType( const char* type_name )
         if( !CvType::first || !CvType::last )
             CvType::first = CvType::last = 0;
 
-        free( info );
+        cvFree( &info );
     }
 }
 
@@ -5486,11 +5486,27 @@ internal::WriteStructContext::WriteStructContext(FileStorage& _fs,
 {
     cvStartWriteStruct(**fs, !name.empty() ? name.c_str() : 0, flags,
                        !typeName.empty() ? typeName.c_str() : 0);
+    fs->elname = String();
+    if ((flags & FileNode::TYPE_MASK) == FileNode::SEQ)
+    {
+        fs->state = FileStorage::VALUE_EXPECTED;
+        fs->structs.push_back('[');
+    }
+    else
+    {
+        fs->state = FileStorage::NAME_EXPECTED + FileStorage::INSIDE_MAP;
+        fs->structs.push_back('{');
+    }
 }
 
 internal::WriteStructContext::~WriteStructContext()
 {
     cvEndWriteStruct(**fs);
+    fs->structs.pop_back();
+    fs->state = fs->structs.empty() || fs->structs.back() == '{' ?
+        FileStorage::NAME_EXPECTED + FileStorage::INSIDE_MAP :
+        FileStorage::VALUE_EXPECTED;
+    fs->elname = String();
 }
 
 
diff --git a/modules/core/src/system.cpp b/modules/core/src/system.cpp
index d8d8ae632..1e6f592d5 100644
--- a/modules/core/src/system.cpp
+++ b/modules/core/src/system.cpp
@@ -414,24 +414,23 @@ const String& getBuildInformation()
 
 String format( const char* fmt, ... )
 {
-    char buf[1024];
+    AutoBuffer<char, 1024> buf;
 
-    va_list va;
-    va_start(va, fmt);
-    int len = vsnprintf(buf, sizeof(buf), fmt, va);
-    va_end(va);
-
-    if (len >= (int)sizeof(buf))
+    for ( ; ; )
     {
-        String s(len, '\0');
+        va_list va;
         va_start(va, fmt);
-        len = vsnprintf((char*)s.c_str(), len + 1, fmt, va);
-        (void)len;
+        int bsize = static_cast<int>(buf.size()),
+                len = vsnprintf((char *)buf, bsize, fmt, va);
         va_end(va);
-        return s;
-    }
 
-    return String(buf, len);
+        if (len < 0 || len >= bsize)
+        {
+            buf.resize(std::max(bsize << 1, len + 1));
+            continue;
+        }
+        return String((char *)buf, len);
+    }
 }
 
 String tempfile( const char* suffix )
diff --git a/modules/core/src/umatrix.cpp b/modules/core/src/umatrix.cpp
index 44cb3f44b..006049254 100644
--- a/modules/core/src/umatrix.cpp
+++ b/modules/core/src/umatrix.cpp
@@ -88,8 +88,10 @@ void UMatData::unlock()
 
 MatAllocator* UMat::getStdAllocator()
 {
-    if( ocl::haveOpenCL() )
+#ifdef HAVE_OPENCL
+    if( ocl::haveOpenCL() && ocl::useOpenCL() )
         return ocl::getOpenCLAllocator();
+#endif
     return Mat::getStdAllocator();
 }
 
@@ -665,7 +667,7 @@ void UMat::copyTo(OutputArray _dst, InputArray _mask) const
         copyTo(_dst);
         return;
     }
-
+#ifdef HAVE_OPENCL
     int cn = channels(), mtype = _mask.type(), mdepth = CV_MAT_DEPTH(mtype), mcn = CV_MAT_CN(mtype);
     CV_Assert( mdepth == CV_8U && (mcn == 1 || mcn == cn) );
 
@@ -692,7 +694,7 @@ void UMat::copyTo(OutputArray _dst, InputArray _mask) const
                 return;
         }
     }
-
+#endif
     Mat src = getMat(ACCESS_READ);
     src.copyTo(_dst, _mask);
 }
@@ -713,7 +715,7 @@ void UMat::convertTo(OutputArray _dst, int _type, double alpha, double beta) con
         copyTo(_dst);
         return;
     }
-
+#ifdef HAVE_OPENCL
     bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
     bool needDouble = sdepth == CV_64F || ddepth == CV_64F;
     if( dims <= 2 && cn && _dst.isUMat() && ocl::useOpenCL() &&
@@ -748,7 +750,7 @@ void UMat::convertTo(OutputArray _dst, int _type, double alpha, double beta) con
                 return;
         }
     }
-
+#endif
     Mat m = getMat(ACCESS_READ);
     m.convertTo(_dst, _type, alpha, beta);
 }
@@ -756,7 +758,9 @@ void UMat::convertTo(OutputArray _dst, int _type, double alpha, double beta) con
 UMat& UMat::setTo(InputArray _value, InputArray _mask)
 {
     bool haveMask = !_mask.empty();
+#ifdef HAVE_OPENCL
     int tp = type(), cn = CV_MAT_CN(tp);
+
     if( dims <= 2 && cn <= 4 && CV_MAT_DEPTH(tp) < CV_64F && ocl::useOpenCL() )
     {
         Mat value = _value.getMat();
@@ -795,6 +799,7 @@ UMat& UMat::setTo(InputArray _value, InputArray _mask)
                 return *this;
         }
     }
+#endif
     Mat m = getMat(haveMask ? ACCESS_RW : ACCESS_WRITE);
     m.setTo(_value, _mask);
     return *this;
diff --git a/modules/core/test/ocl/test_arithm.cpp b/modules/core/test/ocl/test_arithm.cpp
index 4dd8d150c..d2b26e146 100644
--- a/modules/core/test/ocl/test_arithm.cpp
+++ b/modules/core/test/ocl/test_arithm.cpp
@@ -57,9 +57,9 @@ PARAM_TEST_CASE(Lut, MatDepth, MatDepth, Channels, bool, bool)
     int cn;
     bool use_roi, same_cn;
 
-    TEST_DECLARE_INPUT_PARAMETER(src)
-    TEST_DECLARE_INPUT_PARAMETER(lut)
-    TEST_DECLARE_OUTPUT_PARAMETER(dst)
+    TEST_DECLARE_INPUT_PARAMETER(src);
+    TEST_DECLARE_INPUT_PARAMETER(lut);
+    TEST_DECLARE_OUTPUT_PARAMETER(dst);
 
     virtual void SetUp()
     {
@@ -87,14 +87,14 @@ PARAM_TEST_CASE(Lut, MatDepth, MatDepth, Channels, bool, bool)
         Border dstBorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
         randomSubMat(dst, dst_roi, roiSize, dstBorder, dst_type, 5, 16);
 
-        UMAT_UPLOAD_INPUT_PARAMETER(src)
-        UMAT_UPLOAD_INPUT_PARAMETER(lut)
-        UMAT_UPLOAD_OUTPUT_PARAMETER(dst)
+        UMAT_UPLOAD_INPUT_PARAMETER(src);
+        UMAT_UPLOAD_INPUT_PARAMETER(lut);
+        UMAT_UPLOAD_OUTPUT_PARAMETER(dst);
     }
 
     void Near(double threshold = 0.)
     {
-        OCL_EXPECT_MATS_NEAR(dst, threshold)
+        OCL_EXPECT_MATS_NEAR(dst, threshold);
     }
 };
 
@@ -121,11 +121,11 @@ PARAM_TEST_CASE(ArithmTestBase, MatDepth, Channels, bool)
     cv::Scalar val;
     cv::Scalar val_in_range;
 
-    TEST_DECLARE_INPUT_PARAMETER(src1)
-    TEST_DECLARE_INPUT_PARAMETER(src2)
-    TEST_DECLARE_INPUT_PARAMETER(mask)
-    TEST_DECLARE_OUTPUT_PARAMETER(dst1)
-    TEST_DECLARE_OUTPUT_PARAMETER(dst2)
+    TEST_DECLARE_INPUT_PARAMETER(src1);
+    TEST_DECLARE_INPUT_PARAMETER(src2);
+    TEST_DECLARE_INPUT_PARAMETER(mask);
+    TEST_DECLARE_OUTPUT_PARAMETER(dst1);
+    TEST_DECLARE_OUTPUT_PARAMETER(dst2);
 
     virtual void SetUp()
     {
@@ -167,21 +167,21 @@ PARAM_TEST_CASE(ArithmTestBase, MatDepth, Channels, bool)
                                       rng.uniform(minV, maxV), rng.uniform(minV, maxV));
         }
 
-        UMAT_UPLOAD_INPUT_PARAMETER(src1)
-        UMAT_UPLOAD_INPUT_PARAMETER(src2)
-        UMAT_UPLOAD_INPUT_PARAMETER(mask)
-        UMAT_UPLOAD_OUTPUT_PARAMETER(dst1)
-        UMAT_UPLOAD_OUTPUT_PARAMETER(dst2)
+        UMAT_UPLOAD_INPUT_PARAMETER(src1);
+        UMAT_UPLOAD_INPUT_PARAMETER(src2);
+        UMAT_UPLOAD_INPUT_PARAMETER(mask);
+        UMAT_UPLOAD_OUTPUT_PARAMETER(dst1);
+        UMAT_UPLOAD_OUTPUT_PARAMETER(dst2);
     }
 
     void Near(double threshold = 0.)
     {
-        OCL_EXPECT_MATS_NEAR(dst1, threshold)
+        OCL_EXPECT_MATS_NEAR(dst1, threshold);
     }
 
     void Near1(double threshold = 0.)
     {
-        OCL_EXPECT_MATS_NEAR(dst2, threshold)
+        OCL_EXPECT_MATS_NEAR(dst2, threshold);
     }
 };
 
@@ -556,6 +556,12 @@ OCL_TEST_P(Transpose, Mat)
     {
         generateTestData();
 
+        Size roiSize = src1_roi.size();
+        Border dst1Border = randomBorder(0, use_roi ? MAX_VALUE : 0);
+        randomSubMat(dst1, dst1_roi, Size(roiSize.height, roiSize.width), dst1Border, src1.type(), 5, 16);
+
+        UMAT_UPLOAD_INPUT_PARAMETER(dst1);
+
         OCL_OFF(cv::transpose(src1_roi, dst1_roi));
         OCL_ON(cv::transpose(usrc1_roi, udst1_roi));
 
@@ -580,7 +586,7 @@ OCL_TEST_P(Transpose, SquareInplace)
         OCL_OFF(cv::transpose(src1_roi, src1_roi));
         OCL_ON(cv::transpose(usrc1_roi, usrc1_roi));
 
-        OCL_EXPECT_MATS_NEAR(src1, 0)
+        OCL_EXPECT_MATS_NEAR(src1, 0);
     }
 }
 
@@ -761,7 +767,7 @@ OCL_TEST_P(Bitwise_not, Mat)
 typedef ArithmTestBase Compare;
 
 static const int cmp_codes[] = { CMP_EQ, CMP_GT, CMP_GE, CMP_LT, CMP_LE, CMP_NE };
-static const char* cmp_strs[] = { "CMP_EQ", "CMP_GT", "CMP_GE", "CMP_LT", "CMP_LE", "CMP_NE" };
+static const char * cmp_strs[] = { "CMP_EQ", "CMP_GT", "CMP_GE", "CMP_LT", "CMP_LE", "CMP_NE" };
 static const int cmp_num = sizeof(cmp_codes) / sizeof(int);
 
 OCL_TEST_P(Compare, Mat)
@@ -826,12 +832,14 @@ OCL_TEST_P(Pow, Mat)
     for (int j = 0; j < test_loop_times; j++)
         for (int k = 0, size = sizeof(pows) / sizeof(double); k < size; ++k)
         {
+            SCOPED_TRACE(pows[k]);
+
             generateTestData();
 
             OCL_OFF(cv::pow(src1_roi, pows[k], dst1_roi));
             OCL_ON(cv::pow(usrc1_roi, pows[k], udst1_roi));
 
-            Near(1); // FIXIT: Relative error check!
+            OCL_EXPECT_MATS_NEAR_RELATIVE(dst1, 1e-5);
         }
 }
 
@@ -893,8 +901,8 @@ struct RepeatTestCase :
         Border dst1Border = randomBorder(0, use_roi ? MAX_VALUE : 0);
         randomSubMat(dst1, dst1_roi, dstRoiSize, dst1Border, type, 5, 16);
 
-        UMAT_UPLOAD_INPUT_PARAMETER(src1)
-        UMAT_UPLOAD_OUTPUT_PARAMETER(dst1)
+        UMAT_UPLOAD_INPUT_PARAMETER(src1);
+        UMAT_UPLOAD_OUTPUT_PARAMETER(dst1);
     }
 };
 
@@ -1450,10 +1458,10 @@ PARAM_TEST_CASE(InRange, MatDepth, Channels, bool /*Scalar or not*/, bool /*Roi*
     bool scalars, use_roi;
     cv::Scalar val1, val2;
 
-    TEST_DECLARE_INPUT_PARAMETER(src1)
-    TEST_DECLARE_INPUT_PARAMETER(src2)
-    TEST_DECLARE_INPUT_PARAMETER(src3)
-    TEST_DECLARE_OUTPUT_PARAMETER(dst)
+    TEST_DECLARE_INPUT_PARAMETER(src1);
+    TEST_DECLARE_INPUT_PARAMETER(src2);
+    TEST_DECLARE_INPUT_PARAMETER(src3);
+    TEST_DECLARE_OUTPUT_PARAMETER(dst);
 
     virtual void SetUp()
     {
@@ -1485,15 +1493,15 @@ PARAM_TEST_CASE(InRange, MatDepth, Channels, bool /*Scalar or not*/, bool /*Roi*
         val2 = cv::Scalar(rng.uniform(-100.0, 100.0), rng.uniform(-100.0, 100.0),
                           rng.uniform(-100.0, 100.0), rng.uniform(-100.0, 100.0));
 
-        UMAT_UPLOAD_INPUT_PARAMETER(src1)
-        UMAT_UPLOAD_INPUT_PARAMETER(src2)
-        UMAT_UPLOAD_INPUT_PARAMETER(src3)
-        UMAT_UPLOAD_OUTPUT_PARAMETER(dst)
+        UMAT_UPLOAD_INPUT_PARAMETER(src1);
+        UMAT_UPLOAD_INPUT_PARAMETER(src2);
+        UMAT_UPLOAD_INPUT_PARAMETER(src3);
+        UMAT_UPLOAD_OUTPUT_PARAMETER(dst);
     }
 
     void Near()
     {
-        OCL_EXPECT_MATS_NEAR(dst, 0)
+        OCL_EXPECT_MATS_NEAR(dst, 0);
     }
 };
 
@@ -1565,7 +1573,7 @@ PARAM_TEST_CASE(PatchNaNs, Channels, bool)
     bool use_roi;
     double value;
 
-    TEST_DECLARE_INPUT_PARAMETER(src)
+    TEST_DECLARE_INPUT_PARAMETER(src);
 
     virtual void SetUp()
     {
@@ -1592,12 +1600,12 @@ PARAM_TEST_CASE(PatchNaNs, Channels, bool)
 
         value = randomDouble(-100, 100);
 
-        UMAT_UPLOAD_INPUT_PARAMETER(src)
+        UMAT_UPLOAD_INPUT_PARAMETER(src);
     }
 
     void Near()
     {
-        OCL_EXPECT_MATS_NEAR(src, 0)
+        OCL_EXPECT_MATS_NEAR(src, 0);
     }
 };
 
@@ -1640,8 +1648,8 @@ PARAM_TEST_CASE(Reduce, std::pair<MatDepth, MatDepth>, Channels, int, bool)
     int sdepth, ddepth, cn, dim, dtype;
     bool use_roi;
 
-    TEST_DECLARE_INPUT_PARAMETER(src)
-    TEST_DECLARE_OUTPUT_PARAMETER(dst)
+    TEST_DECLARE_INPUT_PARAMETER(src);
+    TEST_DECLARE_OUTPUT_PARAMETER(dst);
 
     virtual void SetUp()
     {
@@ -1666,8 +1674,8 @@ PARAM_TEST_CASE(Reduce, std::pair<MatDepth, MatDepth>, Channels, int, bool)
         Border dstBorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
         randomSubMat(dst, dst_roi, dstRoiSize, dstBorder, dtype, 5, 16);
 
-        UMAT_UPLOAD_INPUT_PARAMETER(src)
-        UMAT_UPLOAD_OUTPUT_PARAMETER(dst)
+        UMAT_UPLOAD_INPUT_PARAMETER(src);
+        UMAT_UPLOAD_OUTPUT_PARAMETER(dst);
     }
 };
 
@@ -1683,7 +1691,7 @@ OCL_TEST_P(ReduceSum, Mat)
         OCL_ON(cv::reduce(usrc_roi, udst_roi, dim, CV_REDUCE_SUM, dtype));
 
         double eps = ddepth <= CV_32S ? 1 : 1e-4;
-        OCL_EXPECT_MATS_NEAR(dst, eps)
+        OCL_EXPECT_MATS_NEAR(dst, eps);
     }
 }
 
@@ -1698,7 +1706,7 @@ OCL_TEST_P(ReduceMax, Mat)
         OCL_OFF(cv::reduce(src_roi, dst_roi, dim, CV_REDUCE_MAX, dtype));
         OCL_ON(cv::reduce(usrc_roi, udst_roi, dim, CV_REDUCE_MAX, dtype));
 
-        OCL_EXPECT_MATS_NEAR(dst, 0)
+        OCL_EXPECT_MATS_NEAR(dst, 0);
     }
 }
 
@@ -1713,7 +1721,7 @@ OCL_TEST_P(ReduceMin, Mat)
         OCL_OFF(cv::reduce(src_roi, dst_roi, dim, CV_REDUCE_MIN, dtype));
         OCL_ON(cv::reduce(usrc_roi, udst_roi, dim, CV_REDUCE_MIN, dtype));
 
-        OCL_EXPECT_MATS_NEAR(dst, 0)
+        OCL_EXPECT_MATS_NEAR(dst, 0);
     }
 }
 
@@ -1729,7 +1737,7 @@ OCL_TEST_P(ReduceAvg, Mat)
         OCL_ON(cv::reduce(usrc_roi, udst_roi, dim, CV_REDUCE_AVG, dtype));
 
         double eps = ddepth <= CV_32S ? 1 : 5e-6;
-        OCL_EXPECT_MATS_NEAR(dst, eps)
+        OCL_EXPECT_MATS_NEAR(dst, eps);
     }
 }
 
diff --git a/modules/core/test/ocl/test_channels.cpp b/modules/core/test/ocl/test_channels.cpp
index f0dc10250..7565273e7 100644
--- a/modules/core/test/ocl/test_channels.cpp
+++ b/modules/core/test/ocl/test_channels.cpp
@@ -54,16 +54,16 @@ namespace ocl {
 
 //////////////////////////////////////// Merge ///////////////////////////////////////////////
 
-PARAM_TEST_CASE(Merge, MatDepth, Channels, bool)
+PARAM_TEST_CASE(Merge, MatDepth, int, bool)
 {
-    int depth, cn;
+    int depth, nsrc;
     bool use_roi;
 
-    TEST_DECLARE_INPUT_PARAMETER(src1)
-    TEST_DECLARE_INPUT_PARAMETER(src2)
-    TEST_DECLARE_INPUT_PARAMETER(src3)
-    TEST_DECLARE_INPUT_PARAMETER(src4)
-    TEST_DECLARE_OUTPUT_PARAMETER(dst)
+    TEST_DECLARE_INPUT_PARAMETER(src1);
+    TEST_DECLARE_INPUT_PARAMETER(src2);
+    TEST_DECLARE_INPUT_PARAMETER(src3);
+    TEST_DECLARE_INPUT_PARAMETER(src4);
+    TEST_DECLARE_OUTPUT_PARAMETER(dst);
 
     std::vector<Mat> src_roi;
     std::vector<UMat> usrc_roi;
@@ -71,10 +71,15 @@ PARAM_TEST_CASE(Merge, MatDepth, Channels, bool)
     virtual void SetUp()
     {
         depth = GET_PARAM(0);
-        cn = GET_PARAM(1);
+        nsrc = GET_PARAM(1);
         use_roi = GET_PARAM(2);
 
-        CV_Assert(cn >= 1 && cn <= 4);
+        CV_Assert(nsrc >= 1 && nsrc <= 4);
+    }
+
+    int type() // matrix type for one merge source: fixture depth plus a randomly drawn channel count
+    {
+        return CV_MAKE_TYPE(depth, randomInt(1, 3)); // drawn anew on every call; upper-bound inclusiveness of randomInt - TODO confirm
     }
 
     void generateTestData()
@@ -83,34 +88,39 @@ PARAM_TEST_CASE(Merge, MatDepth, Channels, bool)
 
         {
             Border src1Border = randomBorder(0, use_roi ? MAX_VALUE : 0);
-            randomSubMat(src1, src1_roi, roiSize, src1Border, depth, 2, 11);
+            randomSubMat(src1, src1_roi, roiSize, src1Border, type(), 2, 11);
 
             Border src2Border = randomBorder(0, use_roi ? MAX_VALUE : 0);
-            randomSubMat(src2, src2_roi, roiSize, src2Border, depth, -1540, 1740);
+            randomSubMat(src2, src2_roi, roiSize, src2Border, type(), -1540, 1740);
 
             Border src3Border = randomBorder(0, use_roi ? MAX_VALUE : 0);
-            randomSubMat(src3, src3_roi, roiSize, src3Border, depth, -1540, 1740);
+            randomSubMat(src3, src3_roi, roiSize, src3Border, type(), -1540, 1740);
 
             Border src4Border = randomBorder(0, use_roi ? MAX_VALUE : 0);
-            randomSubMat(src4, src4_roi, roiSize, src4Border, depth, -1540, 1740);
+            randomSubMat(src4, src4_roi, roiSize, src4Border, type(), -1540, 1740);
         }
 
-        Border dstBorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
-        randomSubMat(dst, dst_roi, roiSize, dstBorder, CV_MAKE_TYPE(depth, cn), 5, 16);
-
-        UMAT_UPLOAD_INPUT_PARAMETER(src1)
-        UMAT_UPLOAD_INPUT_PARAMETER(src2)
-        UMAT_UPLOAD_INPUT_PARAMETER(src3)
-        UMAT_UPLOAD_INPUT_PARAMETER(src4)
-        UMAT_UPLOAD_OUTPUT_PARAMETER(dst)
+        UMAT_UPLOAD_INPUT_PARAMETER(src1);
+        UMAT_UPLOAD_INPUT_PARAMETER(src2);
+        UMAT_UPLOAD_INPUT_PARAMETER(src3);
+        UMAT_UPLOAD_INPUT_PARAMETER(src4);
 
         src_roi.push_back(src1_roi), usrc_roi.push_back(usrc1_roi);
-        if (cn >= 2)
+        if (nsrc >= 2)
             src_roi.push_back(src2_roi), usrc_roi.push_back(usrc2_roi);
-        if (cn >= 3)
+        if (nsrc >= 3)
             src_roi.push_back(src3_roi), usrc_roi.push_back(usrc3_roi);
-        if (cn >= 4)
+        if (nsrc >= 4)
             src_roi.push_back(src4_roi), usrc_roi.push_back(usrc4_roi);
+
+        int dcn = 0;
+        for (int i = 0; i < nsrc; ++i)
+            dcn += src_roi[i].channels();
+
+        Border dstBorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
+        randomSubMat(dst, dst_roi, roiSize, dstBorder, CV_MAKE_TYPE(depth, dcn), 5, 16);
+
+        UMAT_UPLOAD_OUTPUT_PARAMETER(dst);
     }
 
     void Near(double threshold = 0.)
@@ -139,11 +149,11 @@ PARAM_TEST_CASE(Split, MatType, Channels, bool)
     int depth, cn;
     bool use_roi;
 
-    TEST_DECLARE_INPUT_PARAMETER(src)
-    TEST_DECLARE_OUTPUT_PARAMETER(dst1)
-    TEST_DECLARE_OUTPUT_PARAMETER(dst2)
-    TEST_DECLARE_OUTPUT_PARAMETER(dst3)
-    TEST_DECLARE_OUTPUT_PARAMETER(dst4)
+    TEST_DECLARE_INPUT_PARAMETER(src);
+    TEST_DECLARE_OUTPUT_PARAMETER(dst1);
+    TEST_DECLARE_OUTPUT_PARAMETER(dst2);
+    TEST_DECLARE_OUTPUT_PARAMETER(dst3);
+    TEST_DECLARE_OUTPUT_PARAMETER(dst4);
 
     std::vector<Mat> dst_roi, dst;
     std::vector<UMat> udst_roi, udst;
@@ -177,11 +187,11 @@ PARAM_TEST_CASE(Split, MatType, Channels, bool)
             randomSubMat(dst4, dst4_roi, roiSize, dst4Border, depth, -1540, 1740);
         }
 
-        UMAT_UPLOAD_INPUT_PARAMETER(src)
-        UMAT_UPLOAD_OUTPUT_PARAMETER(dst1)
-        UMAT_UPLOAD_OUTPUT_PARAMETER(dst2)
-        UMAT_UPLOAD_OUTPUT_PARAMETER(dst3)
-        UMAT_UPLOAD_OUTPUT_PARAMETER(dst4)
+        UMAT_UPLOAD_INPUT_PARAMETER(src);
+        UMAT_UPLOAD_OUTPUT_PARAMETER(dst1);
+        UMAT_UPLOAD_OUTPUT_PARAMETER(dst2);
+        UMAT_UPLOAD_OUTPUT_PARAMETER(dst3);
+        UMAT_UPLOAD_OUTPUT_PARAMETER(dst4);
 
         dst_roi.push_back(dst1_roi), udst_roi.push_back(udst1_roi),
                 dst.push_back(dst1), udst.push_back(udst1);
@@ -221,14 +231,14 @@ PARAM_TEST_CASE(MixChannels, MatType, bool)
     int depth;
     bool use_roi;
 
-    TEST_DECLARE_INPUT_PARAMETER(src1)
-    TEST_DECLARE_INPUT_PARAMETER(src2)
-    TEST_DECLARE_INPUT_PARAMETER(src3)
-    TEST_DECLARE_INPUT_PARAMETER(src4)
-    TEST_DECLARE_OUTPUT_PARAMETER(dst1)
-    TEST_DECLARE_OUTPUT_PARAMETER(dst2)
-    TEST_DECLARE_OUTPUT_PARAMETER(dst3)
-    TEST_DECLARE_OUTPUT_PARAMETER(dst4)
+    TEST_DECLARE_INPUT_PARAMETER(src1);
+    TEST_DECLARE_INPUT_PARAMETER(src2);
+    TEST_DECLARE_INPUT_PARAMETER(src3);
+    TEST_DECLARE_INPUT_PARAMETER(src4);
+    TEST_DECLARE_OUTPUT_PARAMETER(dst1);
+    TEST_DECLARE_OUTPUT_PARAMETER(dst2);
+    TEST_DECLARE_OUTPUT_PARAMETER(dst3);
+    TEST_DECLARE_OUTPUT_PARAMETER(dst4);
 
     std::vector<Mat> src_roi, dst_roi, dst;
     std::vector<UMat> usrc_roi, udst_roi, udst;
@@ -287,15 +297,15 @@ PARAM_TEST_CASE(MixChannels, MatType, bool)
             randomSubMat(dst4, dst4_roi, roiSize, dst4Border, type(), -1540, 1740);
         }
 
-        UMAT_UPLOAD_INPUT_PARAMETER(src1)
-        UMAT_UPLOAD_INPUT_PARAMETER(src2)
-        UMAT_UPLOAD_INPUT_PARAMETER(src3)
-        UMAT_UPLOAD_INPUT_PARAMETER(src4)
+        UMAT_UPLOAD_INPUT_PARAMETER(src1);
+        UMAT_UPLOAD_INPUT_PARAMETER(src2);
+        UMAT_UPLOAD_INPUT_PARAMETER(src3);
+        UMAT_UPLOAD_INPUT_PARAMETER(src4);
 
-        UMAT_UPLOAD_OUTPUT_PARAMETER(dst1)
-        UMAT_UPLOAD_OUTPUT_PARAMETER(dst2)
-        UMAT_UPLOAD_OUTPUT_PARAMETER(dst3)
-        UMAT_UPLOAD_OUTPUT_PARAMETER(dst4)
+        UMAT_UPLOAD_OUTPUT_PARAMETER(dst1);
+        UMAT_UPLOAD_OUTPUT_PARAMETER(dst2);
+        UMAT_UPLOAD_OUTPUT_PARAMETER(dst3);
+        UMAT_UPLOAD_OUTPUT_PARAMETER(dst4);
 
         int nsrc = randomInt(1, 5), ndst = randomInt(1, 5);
 
@@ -360,8 +370,8 @@ PARAM_TEST_CASE(InsertChannel, MatDepth, Channels, bool)
     int depth, cn, coi;
     bool use_roi;
 
-    TEST_DECLARE_INPUT_PARAMETER(src)
-    TEST_DECLARE_OUTPUT_PARAMETER(dst)
+    TEST_DECLARE_INPUT_PARAMETER(src);
+    TEST_DECLARE_OUTPUT_PARAMETER(dst);
 
     virtual void SetUp()
     {
@@ -381,8 +391,8 @@ PARAM_TEST_CASE(InsertChannel, MatDepth, Channels, bool)
         Border dstBorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
         randomSubMat(dst, dst_roi, roiSize, dstBorder, CV_MAKE_TYPE(depth, cn), 5, 16);
 
-        UMAT_UPLOAD_INPUT_PARAMETER(src)
-        UMAT_UPLOAD_OUTPUT_PARAMETER(dst)
+        UMAT_UPLOAD_INPUT_PARAMETER(src);
+        UMAT_UPLOAD_OUTPUT_PARAMETER(dst);
     }
 };
 
@@ -406,8 +416,8 @@ PARAM_TEST_CASE(ExtractChannel, MatDepth, Channels, bool)
     int depth, cn, coi;
     bool use_roi;
 
-    TEST_DECLARE_INPUT_PARAMETER(src)
-    TEST_DECLARE_OUTPUT_PARAMETER(dst)
+    TEST_DECLARE_INPUT_PARAMETER(src);
+    TEST_DECLARE_OUTPUT_PARAMETER(dst);
 
     virtual void SetUp()
     {
@@ -427,8 +437,8 @@ PARAM_TEST_CASE(ExtractChannel, MatDepth, Channels, bool)
         Border dstBorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
         randomSubMat(dst, dst_roi, roiSize, dstBorder, depth, 5, 16);
 
-        UMAT_UPLOAD_INPUT_PARAMETER(src)
-        UMAT_UPLOAD_OUTPUT_PARAMETER(dst)
+        UMAT_UPLOAD_INPUT_PARAMETER(src);
+        UMAT_UPLOAD_OUTPUT_PARAMETER(dst);
     }
 };
 
@@ -447,7 +457,7 @@ OCL_TEST_P(ExtractChannel, Accuracy)
 
 //////////////////////////////////////// Instantiation ///////////////////////////////////////////////
 
-OCL_INSTANTIATE_TEST_CASE_P(Channels, Merge, Combine(OCL_ALL_DEPTHS, OCL_ALL_CHANNELS, Bool()));
+OCL_INSTANTIATE_TEST_CASE_P(Channels, Merge, Combine(OCL_ALL_DEPTHS, Values(1, 2, 3, 4), Bool()));
 OCL_INSTANTIATE_TEST_CASE_P(Channels, Split, Combine(OCL_ALL_DEPTHS, OCL_ALL_CHANNELS, Bool()));
 OCL_INSTANTIATE_TEST_CASE_P(Channels, MixChannels, Combine(OCL_ALL_DEPTHS, Bool()));
 OCL_INSTANTIATE_TEST_CASE_P(Channels, InsertChannel, Combine(OCL_ALL_DEPTHS, OCL_ALL_CHANNELS, Bool()));
diff --git a/modules/core/test/ocl/test_dft.cpp b/modules/core/test/ocl/test_dft.cpp
index cc9b06d38..1f0e43b20 100644
--- a/modules/core/test/ocl/test_dft.cpp
+++ b/modules/core/test/ocl/test_dft.cpp
@@ -60,8 +60,8 @@ PARAM_TEST_CASE(Dft, cv::Size, MatDepth, bool, bool, bool, bool)
     int	dft_flags, depth;
     bool inplace;
 
-    TEST_DECLARE_INPUT_PARAMETER(src)
-    TEST_DECLARE_OUTPUT_PARAMETER(dst)
+    TEST_DECLARE_INPUT_PARAMETER(src);
+    TEST_DECLARE_OUTPUT_PARAMETER(dst);
 
     virtual void SetUp()
     {
@@ -106,9 +106,9 @@ PARAM_TEST_CASE(MulSpectrums, bool, bool)
 {
     bool ccorr, useRoi;
 
-    TEST_DECLARE_INPUT_PARAMETER(src1)
-    TEST_DECLARE_INPUT_PARAMETER(src2)
-    TEST_DECLARE_OUTPUT_PARAMETER(dst)
+    TEST_DECLARE_INPUT_PARAMETER(src1);
+    TEST_DECLARE_INPUT_PARAMETER(src2);
+    TEST_DECLARE_OUTPUT_PARAMETER(dst);
 
     virtual void SetUp()
     {
@@ -129,9 +129,9 @@ PARAM_TEST_CASE(MulSpectrums, bool, bool)
         Border dstBorder = randomBorder(0, useRoi ? MAX_VALUE : 0);
         randomSubMat(dst, dst_roi, srcRoiSize, dstBorder, CV_32FC2, 5, 16);
 
-        UMAT_UPLOAD_INPUT_PARAMETER(src1)
-        UMAT_UPLOAD_INPUT_PARAMETER(src2)
-        UMAT_UPLOAD_OUTPUT_PARAMETER(dst)
+        UMAT_UPLOAD_INPUT_PARAMETER(src1);
+        UMAT_UPLOAD_INPUT_PARAMETER(src2);
+        UMAT_UPLOAD_OUTPUT_PARAMETER(dst);
     }
 };
 
diff --git a/modules/core/test/ocl/test_gemm.cpp b/modules/core/test/ocl/test_gemm.cpp
index 4d453f334..e92fc2a1c 100644
--- a/modules/core/test/ocl/test_gemm.cpp
+++ b/modules/core/test/ocl/test_gemm.cpp
@@ -67,10 +67,10 @@ PARAM_TEST_CASE(Gemm,
 
     double alpha, beta;
 
-    TEST_DECLARE_INPUT_PARAMETER(A)
-    TEST_DECLARE_INPUT_PARAMETER(B)
-    TEST_DECLARE_INPUT_PARAMETER(C)
-    TEST_DECLARE_OUTPUT_PARAMETER(D)
+    TEST_DECLARE_INPUT_PARAMETER(A);
+    TEST_DECLARE_INPUT_PARAMETER(B);
+    TEST_DECLARE_INPUT_PARAMETER(C);
+    TEST_DECLARE_OUTPUT_PARAMETER(D);
 
     virtual void SetUp()
     {
@@ -119,10 +119,10 @@ PARAM_TEST_CASE(Gemm,
         alpha = randomDouble(-4, 4);
         beta = randomDouble(-4, 4);
 
-        UMAT_UPLOAD_INPUT_PARAMETER(A)
-        UMAT_UPLOAD_INPUT_PARAMETER(B)
-        UMAT_UPLOAD_INPUT_PARAMETER(C)
-        UMAT_UPLOAD_OUTPUT_PARAMETER(D)
+        UMAT_UPLOAD_INPUT_PARAMETER(A);
+        UMAT_UPLOAD_INPUT_PARAMETER(B);
+        UMAT_UPLOAD_INPUT_PARAMETER(C);
+        UMAT_UPLOAD_OUTPUT_PARAMETER(D);
     }
 };
 
diff --git a/modules/core/test/ocl/test_matrix_operation.cpp b/modules/core/test/ocl/test_matrix_operation.cpp
index 77c5dad95..901609538 100644
--- a/modules/core/test/ocl/test_matrix_operation.cpp
+++ b/modules/core/test/ocl/test_matrix_operation.cpp
@@ -59,8 +59,8 @@ PARAM_TEST_CASE(ConvertTo, MatDepth, MatDepth, Channels, bool)
     int src_depth, cn, dstType;
     bool use_roi;
 
-    TEST_DECLARE_INPUT_PARAMETER(src)
-    TEST_DECLARE_OUTPUT_PARAMETER(dst)
+    TEST_DECLARE_INPUT_PARAMETER(src);
+    TEST_DECLARE_OUTPUT_PARAMETER(dst);
 
     virtual void SetUp()
     {
@@ -80,8 +80,8 @@ PARAM_TEST_CASE(ConvertTo, MatDepth, MatDepth, Channels, bool)
         Border dstBorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
         randomSubMat(dst, dst_roi, roiSize, dstBorder, dstType, 5, 16);
 
-        UMAT_UPLOAD_INPUT_PARAMETER(src)
-        UMAT_UPLOAD_OUTPUT_PARAMETER(dst)
+        UMAT_UPLOAD_INPUT_PARAMETER(src);
+        UMAT_UPLOAD_OUTPUT_PARAMETER(dst);
     }
 };
 
@@ -108,9 +108,9 @@ PARAM_TEST_CASE(CopyTo, MatDepth, Channels, bool, bool)
     int depth, cn;
     bool use_roi, use_mask;
 
-    TEST_DECLARE_INPUT_PARAMETER(src)
-    TEST_DECLARE_INPUT_PARAMETER(mask)
-    TEST_DECLARE_OUTPUT_PARAMETER(dst)
+    TEST_DECLARE_INPUT_PARAMETER(src);
+    TEST_DECLARE_INPUT_PARAMETER(mask);
+    TEST_DECLARE_OUTPUT_PARAMETER(dst);
 
     virtual void SetUp()
     {
@@ -139,10 +139,10 @@ PARAM_TEST_CASE(CopyTo, MatDepth, Channels, bool, bool)
         Border dstBorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
         randomSubMat(dst, dst_roi, roiSize, dstBorder, type, 5, 16);
 
-        UMAT_UPLOAD_INPUT_PARAMETER(src)
+        UMAT_UPLOAD_INPUT_PARAMETER(src);
         if (use_mask)
-            UMAT_UPLOAD_INPUT_PARAMETER(mask)
-        UMAT_UPLOAD_OUTPUT_PARAMETER(dst)
+            UMAT_UPLOAD_INPUT_PARAMETER(mask);
+        UMAT_UPLOAD_OUTPUT_PARAMETER(dst);
     }
 };
 
@@ -169,7 +169,7 @@ OCL_TEST_P(CopyTo, Accuracy)
 }
 
 OCL_INSTANTIATE_TEST_CASE_P(MatrixOperation, ConvertTo, Combine(
-                            OCL_ALL_DEPTHS, OCL_ALL_DEPTHS, OCL_ALL_CHANNELS, Bool()));
+                                OCL_ALL_DEPTHS, OCL_ALL_DEPTHS, OCL_ALL_CHANNELS, Bool()));
 
 OCL_INSTANTIATE_TEST_CASE_P(MatrixOperation, CopyTo, Combine(
                                 OCL_ALL_DEPTHS, OCL_ALL_CHANNELS, Bool(), Bool()));
diff --git a/modules/core/test/test_io.cpp b/modules/core/test/test_io.cpp
index 23c0aad62..e6c412f86 100644
--- a/modules/core/test/test_io.cpp
+++ b/modules/core/test/test_io.cpp
@@ -380,6 +380,40 @@ TEST(Core_InputOutput, write_read_consistency) { Core_IOTest test; test.safe_run
 
 extern void testFormatter();
 
+
+struct UserDefinedType // small aggregate used by the test to store and reload a user-defined type via FileStorage
+{
+    int a; // stored under key "a" by write()
+    float b; // stored under key "b" by write()
+};
+
+static inline bool operator==(const UserDefinedType &x, // member-wise equality of two UserDefinedType values
+                              const UserDefinedType &y) {
+    return (x.a == y.a) && (x.b == y.b); // exact comparison, including the float member
+}
+
+static inline void write(FileStorage &fs, // serialises value into fs as a mapping with keys "a" and "b"
+                         const String&, // element name: unused here
+                         const UserDefinedType &value)
+{
+    fs << "{:" << "a" << value.a << "b" << value.b << "}"; // "{:" presumably opens a compact (flow) mapping - confirm
+}
+
+static inline void read(const FileNode& node, // loads value from node, or copies default_value when node is empty
+                        UserDefinedType& value,
+                        const UserDefinedType& default_value
+                          = UserDefinedType()) {
+    if(node.empty()) // nothing stored under this node: use the fallback
+    {
+        value = default_value;
+    }
+    else
+    {
+        node["a"] >> value.a; // same keys that write() emits
+        node["b"] >> value.b;
+    }
+}
+
 class CV_MiscIOTest : public cvtest::BaseTest
 {
 public:
@@ -393,11 +427,14 @@ protected:
             string fname = cv::tempfile(".xml");
             vector<int> mi, mi2, mi3, mi4;
             vector<Mat> mv, mv2, mv3, mv4;
+            vector<UserDefinedType> vudt, vudt2, vudt3, vudt4;
             Mat m(10, 9, CV_32F);
             Mat empty;
+            UserDefinedType udt = { 8, 3.3f };
             randu(m, 0, 1);
             mi3.push_back(5);
             mv3.push_back(m);
+            vudt3.push_back(udt);
             Point_<float> p1(1.1f, 2.2f), op1;
             Point3i p2(3, 4, 5), op2;
             Size s1(6, 7), os1;
@@ -412,6 +449,8 @@ protected:
             fs << "mv" << mv;
             fs << "mi3" << mi3;
             fs << "mv3" << mv3;
+            fs << "vudt" << vudt;
+            fs << "vudt3" << vudt3;
             fs << "empty" << empty;
             fs << "p1" << p1;
             fs << "p2" << p2;
@@ -428,6 +467,8 @@ protected:
             fs["mv"] >> mv2;
             fs["mi3"] >> mi4;
             fs["mv3"] >> mv4;
+            fs["vudt"] >> vudt2;
+            fs["vudt3"] >> vudt4;
             fs["empty"] >> empty;
             fs["p1"] >> op1;
             fs["p2"] >> op2;
@@ -442,6 +483,8 @@ protected:
             CV_Assert( norm(mi3, mi4, CV_C) == 0 );
             CV_Assert( mv4.size() == 1 );
             double n = norm(mv3[0], mv4[0], CV_C);
+            CV_Assert( vudt2.empty() );
+            CV_Assert( vudt3 == vudt4 );
             CV_Assert( n == 0 );
             CV_Assert( op1 == p1 );
             CV_Assert( op2 == p2 );
diff --git a/modules/core/test/test_umat.cpp b/modules/core/test/test_umat.cpp
index 60f9ee66a..b7deb4895 100644
--- a/modules/core/test/test_umat.cpp
+++ b/modules/core/test/test_umat.cpp
@@ -795,4 +795,176 @@ TEST(UMat, ReadBufferRect)
     EXPECT_MAT_NEAR(t, t2, 0);
 }
 
+// Use iGPU or OPENCV_OPENCL_DEVICE=:CPU: to catch problem
+TEST(UMat, DISABLED_synchronization_map_unmap) // maps/unmaps one UMat from two parallel ranges; DISABLED_ tests do not run by default
+{
+    class TestParallelLoopBody : public cv::ParallelLoopBody
+    {
+        UMat u_; // UMat header copied from the one passed to the constructor
+    public:
+        TestParallelLoopBody(const UMat& u) : u_(u) { }
+        void operator() (const cv::Range& range) const
+        {
+            printf("range: %d, %d -- begin\n", range.start, range.end);
+            for (int i = 0; i < 10; i++) // ten map/unmap rounds per range
+            {
+                printf("%d: %d map...\n", range.start, i);
+                Mat m = u_.getMat(cv::ACCESS_READ); // obtain a read-access Mat for the UMat
+
+                printf("%d: %d unmap...\n", range.start, i);
+                m.release(); // drop the Mat again before the next round
+            }
+            printf("range: %d, %d -- end\n", range.start, range.end);
+        }
+    };
+    try
+    {
+        UMat u(1000, 1000, CV_32FC1);
+        parallel_for_(cv::Range(0, 2), TestParallelLoopBody(u)); // range [0, 2) handed to parallel_for_
+    }
+    catch (const cv::Exception& e)
+    {
+        FAIL() << "Exception: " << e.what();
+        ADD_FAILURE(); // NOTE(review): probably unreachable - FAIL() is a fatal failure that returns from the test body
+    }
+    catch (...)
+    {
+        FAIL() << "Exception!";
+    }
+}
+
 } } // namespace cvtest::ocl
+
+TEST(UMat, DISABLED_bug_with_unmap) // releases a Mat-derived UMat right after starting an operation on it; repeated 20 times
+{
+    for (int i = 0; i < 20; i++)
+    {
+        try
+        {
+            Mat m = Mat(1000, 1000, CV_8UC1);
+            UMat u = m.getUMat(ACCESS_READ); // UMat obtained from the Mat m
+            UMat dst;
+            add(u, Scalar::all(0), dst); // start async operation
+            u.release(); // drop the UMat obtained from m first ...
+            m.release(); // ... then m itself
+        }
+        catch (const cv::Exception& e)
+        {
+            printf("i = %d... %s\n", i, e.what());
+            ADD_FAILURE(); // non-fatal: record the failure and continue with the next iteration
+        }
+        catch (...)
+        {
+            printf("i = %d...\n", i);
+            ADD_FAILURE();
+        }
+    }
+}
+
+TEST(UMat, DISABLED_bug_with_unmap_in_class) // keeps a UMat obtained from a local Mat alive after that Mat is destroyed
+{
+    class Logic
+    {
+    public:
+        Logic() {}
+        void processData(InputArray input)
+        {
+            Mat m = input.getMat();
+            {
+                Mat dst;
+                m.convertTo(dst, CV_32FC1);
+                // some additional CPU-based per-pixel processing into dst
+                intermediateResult = dst.getUMat(ACCESS_READ); // UMat obtained from dst is stored in a member and outlives dst
+                std::cout << "data processed..." << std::endl;
+            } // problem is here: dst::~Mat()
+            std::cout << "leave ProcessData()" << std::endl;
+        }
+        UMat getResult() const { return intermediateResult; }
+    protected:
+        UMat intermediateResult; // set by processData(), returned by getResult()
+    };
+    try
+    {
+        Mat m = Mat(1000, 1000, CV_8UC1);
+        Logic l;
+        l.processData(m);
+        UMat result = l.getResult(); // fetched after the local dst inside processData() is gone
+    }
+    catch (const cv::Exception& e)
+    {
+        printf("exception... %s\n", e.what());
+        ADD_FAILURE();
+    }
+    catch (...)
+    {
+        printf("exception... \n");
+        ADD_FAILURE();
+    }
+}
+
+TEST(UMat, Test_same_behaviour_read_and_read) // expects no exception when u is read while a read-access Mat from it is alive
+{
+    bool exceptionDetected = false;
+    try
+    {
+        UMat u(Size(10, 10), CV_8UC1);
+        Mat m = u.getMat(ACCESS_READ); // m stays in scope (read access) during the add() below
+        UMat dst;
+        add(u, Scalar::all(1), dst); // u is only an input here; the result goes to a separate dst
+    }
+    catch (...)
+    {
+        exceptionDetected = true;
+    }
+    ASSERT_FALSE(exceptionDetected); // no data race, 2+ reads are valid
+}
+
+// VP: this test (and probably others from same_behaviour series) is not valid in my opinion.
+TEST(UMat, DISABLED_Test_same_behaviour_read_and_write) // expects an exception when u is written while a read-access Mat from it is alive
+{
+    bool exceptionDetected = false;
+    try
+    {
+        UMat u(Size(10, 10), CV_8UC1);
+        Mat m = u.getMat(ACCESS_READ); // m stays in scope (read access) during the add() below
+        add(u, Scalar::all(1), u); // in-place: u is both source and destination
+    }
+    catch (...)
+    {
+        exceptionDetected = true;
+    }
+    ASSERT_TRUE(exceptionDetected); // data race
+}
+
+TEST(UMat, DISABLED_Test_same_behaviour_write_and_read) // expects an exception when u is read while a write-access Mat from it is alive
+{
+    bool exceptionDetected = false;
+    try
+    {
+        UMat u(Size(10, 10), CV_8UC1);
+        Mat m = u.getMat(ACCESS_WRITE); // m stays in scope (write access) during the add() below
+        UMat dst;
+        add(u, Scalar::all(1), dst); // u is only an input here; the result goes to a separate dst
+    }
+    catch (...)
+    {
+        exceptionDetected = true;
+    }
+    ASSERT_TRUE(exceptionDetected); // data race
+}
+
+TEST(UMat, DISABLED_Test_same_behaviour_write_and_write) // expects an exception when u is written while a write-access Mat from it is alive
+{
+    bool exceptionDetected = false;
+    try
+    {
+        UMat u(Size(10, 10), CV_8UC1);
+        Mat m = u.getMat(ACCESS_WRITE); // m stays in scope (write access) during the add() below
+        add(u, Scalar::all(1), u); // in-place: u is both source and destination
+    }
+    catch (...)
+    {
+        exceptionDetected = true;
+    }
+    ASSERT_TRUE(exceptionDetected); // data race
+}
diff --git a/modules/cudabgsegm/src/cuda/mog2.cu b/modules/cudabgsegm/src/cuda/mog2.cu
index de8df6c94..789afa47a 100644
--- a/modules/cudabgsegm/src/cuda/mog2.cu
+++ b/modules/cudabgsegm/src/cuda/mog2.cu
@@ -163,7 +163,7 @@ namespace cv { namespace cuda { namespace device
             {
                 //need only weight if fit is found
                 float weight = alpha1 * gmm_weight(mode * frame.rows + y, x) + prune;
-
+                int swap_count = 0;
                 //fit not found yet
                 if (!fitsPDF)
                 {
@@ -214,6 +214,7 @@ namespace cv { namespace cuda { namespace device
                             if (weight < gmm_weight((i - 1) * frame.rows + y, x))
                                 break;
 
+                            swap_count++;
                             //swap one up
                             swap(gmm_weight, x, y, i - 1, frame.rows);
                             swap(gmm_variance, x, y, i - 1, frame.rows);
@@ -231,7 +232,7 @@ namespace cv { namespace cuda { namespace device
                     nmodes--;
                 }
 
-                gmm_weight(mode * frame.rows + y, x) = weight; //update weight by the calculated value
+                gmm_weight((mode - swap_count) * frame.rows + y, x) = weight; //update weight by the calculated value
                 totalWeight += weight;
             }
 
diff --git a/modules/features2d/doc/common_interfaces_of_descriptor_extractors.rst b/modules/features2d/doc/common_interfaces_of_descriptor_extractors.rst
index 4563f65c2..4bb2ba159 100644
--- a/modules/features2d/doc/common_interfaces_of_descriptor_extractors.rst
+++ b/modules/features2d/doc/common_interfaces_of_descriptor_extractors.rst
@@ -69,7 +69,7 @@ Computes the descriptors for a set of keypoints detected in an image (first vari
 
     :param keypoints: Input collection of keypoints. Keypoints for which a descriptor cannot be computed are removed. Sometimes new keypoints can be added, for example: ``SIFT`` duplicates keypoint with several dominant orientations (for each orientation).
 
-    :param descriptors: Computed descriptors. In the second variant of the method ``descriptors[i]`` are descriptors computed for a ``keypoints[i]`. Row ``j`` is the ``keypoints`` (or ``keypoints[i]``) is the descriptor for keypoint ``j``-th keypoint.
+    :param descriptors: Computed descriptors. In the second variant of the method ``descriptors[i]`` are the descriptors computed for ``keypoints[i]``. Row ``j`` of ``descriptors`` (or ``descriptors[i]``) is the descriptor for the ``j``-th keypoint of ``keypoints`` (or ``keypoints[i]``).
 
 
 DescriptorExtractor::create
diff --git a/modules/features2d/doc/common_interfaces_of_descriptor_matchers.rst b/modules/features2d/doc/common_interfaces_of_descriptor_matchers.rst
index 295cc8381..daf6bcd82 100644
--- a/modules/features2d/doc/common_interfaces_of_descriptor_matchers.rst
+++ b/modules/features2d/doc/common_interfaces_of_descriptor_matchers.rst
@@ -249,7 +249,7 @@ Brute-force matcher constructor.
 
     :param normType: One of ``NORM_L1``, ``NORM_L2``, ``NORM_HAMMING``, ``NORM_HAMMING2``. ``L1`` and ``L2`` norms are preferable choices for SIFT and SURF descriptors, ``NORM_HAMMING`` should be used with ORB, BRISK and BRIEF, ``NORM_HAMMING2`` should be used with ORB when ``WTA_K==3`` or ``4`` (see ORB::ORB constructor description).
 
-    :param crossCheck: If it is false, this is will be default BFMatcher behaviour when it finds the k nearest neighbors for each query descriptor. If ``crossCheck==true``, then the ``knnMatch()`` method with ``k=1`` will only return pairs ``(i,j)`` such that for ``i-th`` query descriptor the ``j-th`` descriptor in the matcher's collection is the nearest and vice versa, i.e. the ``BFMathcher`` will only return consistent pairs. Such technique usually produces best results with minimal number of outliers when there are enough matches. This is alternative to the ratio test, used by D. Lowe in SIFT paper.
+    :param crossCheck: If it is false, this will be the default BFMatcher behaviour, where it finds the k nearest neighbors for each query descriptor. If ``crossCheck==true``, then the ``knnMatch()`` method with ``k=1`` will only return pairs ``(i,j)`` such that for ``i-th`` query descriptor the ``j-th`` descriptor in the matcher's collection is the nearest and vice versa, i.e. the ``BFMatcher`` will only return consistent pairs. Such a technique usually produces the best results with a minimal number of outliers when there are enough matches. This is an alternative to the ratio test, used by D. Lowe in the SIFT paper.
 
 
 FlannBasedMatcher
diff --git a/modules/features2d/include/opencv2/features2d.hpp b/modules/features2d/include/opencv2/features2d.hpp
index d1f4ee5bc..190e8ac66 100644
--- a/modules/features2d/include/opencv2/features2d.hpp
+++ b/modules/features2d/include/opencv2/features2d.hpp
@@ -616,14 +616,14 @@ protected:
 };
 
 
-class CV_EXPORTS DenseFeatureDetector : public FeatureDetector
+class CV_EXPORTS_W DenseFeatureDetector : public FeatureDetector
 {
 public:
-    explicit DenseFeatureDetector( float initFeatureScale=1.f, int featureScaleLevels=1,
-                                   float featureScaleMul=0.1f,
-                                   int initXyStep=6, int initImgBound=0,
-                                   bool varyXyStepWithScale=true,
-                                   bool varyImgBoundWithScale=false );
+    CV_WRAP explicit DenseFeatureDetector( float initFeatureScale=1.f, int featureScaleLevels=1,
+                                           float featureScaleMul=0.1f,
+                                           int initXyStep=6, int initImgBound=0,
+                                           bool varyXyStepWithScale=true,
+                                           bool varyImgBoundWithScale=false );
     AlgorithmInfo* info() const;
 
 protected:
diff --git a/modules/features2d/perf/opencl/perf_brute_force_matcher.cpp b/modules/features2d/perf/opencl/perf_brute_force_matcher.cpp
index f7bd24cf5..2e6e57416 100644
--- a/modules/features2d/perf/opencl/perf_brute_force_matcher.cpp
+++ b/modules/features2d/perf/opencl/perf_brute_force_matcher.cpp
@@ -123,7 +123,7 @@ OCL_PERF_TEST_P(BruteForceMatcherFixture, RadiusMatch, ::testing::Combine(OCL_PE
     SANITY_CHECK_MATCHES(matches1, 1e-3);
 }
 
-}//ocl
-}//cvtest
+} // ocl
+} // cvtest
 
-#endif //HAVE_OPENCL
+#endif // HAVE_OPENCL
diff --git a/modules/features2d/perf/opencl/perf_fast.cpp b/modules/features2d/perf/opencl/perf_fast.cpp
new file mode 100644
index 000000000..7816da7b1
--- /dev/null
+++ b/modules/features2d/perf/opencl/perf_fast.cpp
@@ -0,0 +1,50 @@
+#include "perf_precomp.hpp"
+#include "opencv2/ts/ocl_perf.hpp"
+
+#ifdef HAVE_OPENCL
+
+namespace cvtest {
+namespace ocl {
+
+enum { TYPE_5_8 =FastFeatureDetector::TYPE_5_8, TYPE_7_12 = FastFeatureDetector::TYPE_7_12, TYPE_9_16 = FastFeatureDetector::TYPE_9_16 };
+CV_ENUM(FastType, TYPE_5_8, TYPE_7_12)
+
+typedef std::tr1::tuple<string, FastType> File_Type_t;
+typedef TestBaseWithParam<File_Type_t> FASTFixture;
+
+#define FAST_IMAGES \
+    "cv/detectors_descriptors_evaluation/images_datasets/leuven/img1.png",\
+    "stitching/a3.png"
+
+OCL_PERF_TEST_P(FASTFixture, FastDetect, testing::Combine(
+                            testing::Values(FAST_IMAGES),
+                            FastType::all()
+                          ))
+{ // Perf test: FAST keypoint detection on a UMat, for every (image, FastType) combination.
+    string filename = getDataPath(get<0>(GetParam())); // tuple element 0: image path
+    int type = get<1>(GetParam()); // tuple element 1: FastType value
+    Mat mframe = imread(filename, IMREAD_GRAYSCALE);
+
+    if (mframe.empty())
+        FAIL() << "Unable to load source image " << filename;
+
+    UMat frame;
+    mframe.copyTo(frame); // the detector below is run on the UMat copy, not on mframe
+    declare.in(frame);
+
+    Ptr<FeatureDetector> fd = Algorithm::create<FeatureDetector>("Feature2D.FAST");
+    ASSERT_FALSE( fd.empty() ); // stop if the algorithm could not be created by name
+    fd->set("threshold", 20);
+    fd->set("nonmaxSuppression", true);
+    fd->set("type", type);
+    vector<KeyPoint> points;
+
+    OCL_TEST_CYCLE() fd->detect(frame, points); // the statement executed by the test cycle
+
+    SANITY_CHECK_KEYPOINTS(points);
+}
+
+} // ocl
+} // cvtest
+
+#endif // HAVE_OPENCL
diff --git a/modules/features2d/perf/opencl/perf_orb.cpp b/modules/features2d/perf/opencl/perf_orb.cpp
new file mode 100644
index 000000000..f40b5f4b9
--- /dev/null
+++ b/modules/features2d/perf/opencl/perf_orb.cpp
@@ -0,0 +1,86 @@
+#include "perf_precomp.hpp"
+#include "opencv2/ts/ocl_perf.hpp"
+
+#ifdef HAVE_OPENCL
+
+namespace cvtest {
+namespace ocl {
+
+typedef ::perf::TestBaseWithParam<std::string> ORBFixture;
+
+#define ORB_IMAGES OCL_PERF_ENUM("cv/detectors_descriptors_evaluation/images_datasets/leuven/img1.png", "stitching/a3.png")
+
+OCL_PERF_TEST_P(ORBFixture, ORB_Detect, ORB_IMAGES)
+{ // Perf test: ORB keypoint detection only (no descriptors) on a UMat.
+    string filename = getDataPath(GetParam());
+    Mat mframe = imread(filename, IMREAD_GRAYSCALE);
+
+    if (mframe.empty())
+        FAIL() << "Unable to load source image " << filename;
+
+    UMat frame, mask; // mask is never filled, so an empty mask is passed to the detector
+    mframe.copyTo(frame);
+
+    declare.in(frame);
+    ORB detector(1500, 1.3f, 1); // presumably nfeatures, scaleFactor, nlevels - confirm against the ORB constructor
+    vector<KeyPoint> points;
+
+    OCL_TEST_CYCLE() detector(frame, mask, points); // the statement executed by the test cycle
+
+    std::sort(points.begin(), points.end(), comparators::KeypointGreater()); // fixed order before the sanity check
+    SANITY_CHECK_KEYPOINTS(points, 1e-5);
+}
+
+OCL_PERF_TEST_P(ORBFixture, ORB_Extract, ORB_IMAGES)
+{ // Perf test: ORB descriptor computation for keypoints that were detected before the test cycle.
+    string filename = getDataPath(GetParam());
+    Mat mframe = imread(filename, IMREAD_GRAYSCALE);
+
+    if (mframe.empty())
+        FAIL() << "Unable to load source image " << filename;
+
+    UMat mask, frame; // mask is never filled, so an empty mask is passed to the detector
+    mframe.copyTo(frame);
+
+    declare.in(frame);
+
+    ORB detector(1500, 1.3f, 1); // presumably nfeatures, scaleFactor, nlevels - confirm against the ORB constructor
+    vector<KeyPoint> points;
+    detector(frame, mask, points); // detection happens here, outside the test cycle
+    std::sort(points.begin(), points.end(), comparators::KeypointGreater());
+
+    UMat descriptors;
+
+    OCL_TEST_CYCLE() detector(frame, mask, points, descriptors, true); // true: presumably "use the provided keypoints" - confirm
+
+    SANITY_CHECK(descriptors);
+}
+
+OCL_PERF_TEST_P(ORBFixture, ORB_Full, ORB_IMAGES)
+{ // Perf test: ORB detection and descriptor computation in a single call.
+    string filename = getDataPath(GetParam());
+    Mat mframe = imread(filename, IMREAD_GRAYSCALE);
+
+    if (mframe.empty())
+        FAIL() << "Unable to load source image " << filename;
+
+    UMat mask, frame; // mask is never filled, so an empty mask is passed to the detector
+    mframe.copyTo(frame);
+
+    declare.in(frame);
+    ORB detector(1500, 1.3f, 1); // presumably nfeatures, scaleFactor, nlevels - confirm against the ORB constructor
+
+    vector<KeyPoint> points;
+    UMat descriptors;
+
+    OCL_TEST_CYCLE() detector(frame, mask, points, descriptors, false); // false: presumably "detect keypoints too" - confirm
+
+    ::perf::sort(points, descriptors); // sorts keypoints and descriptors together before the sanity checks
+    SANITY_CHECK_KEYPOINTS(points, 1e-5);
+    SANITY_CHECK(descriptors);
+}
+
+} // ocl
+} // cvtest
+
+#endif // HAVE_OPENCL
diff --git a/modules/flann/include/opencv2/flann/kmeans_index.h b/modules/flann/include/opencv2/flann/kmeans_index.h
index c3b1fd5b8..dc1781cc5 100644
--- a/modules/flann/include/opencv2/flann/kmeans_index.h
+++ b/modules/flann/include/opencv2/flann/kmeans_index.h
@@ -758,10 +758,13 @@ private:
 
                     for (int k=0; k<indices_length; ++k) {
                         if (belongs_to[k]==j) {
-                            belongs_to[k] = i;
-                            count[j]--;
-                            count[i]++;
-                            break;
+                            // for cluster j, we move the furthest element from the center to the empty cluster i
+                            if ( distance_(dataset_[indices[k]], dcenters[j], veclen_) == radiuses[j] ) {
+                                belongs_to[k] = i;
+                                count[j]--;
+                                count[i]++;
+                                break;
+                            }
                         }
                     }
                     converged = false;
diff --git a/modules/highgui/doc/reading_and_writing_images_and_video.rst b/modules/highgui/doc/reading_and_writing_images_and_video.rst
index 539b31e86..24613943d 100644
--- a/modules/highgui/doc/reading_and_writing_images_and_video.rst
+++ b/modules/highgui/doc/reading_and_writing_images_and_video.rst
@@ -483,7 +483,7 @@ VideoWriter constructors
 
     :param filename: Name of the output video file.
 
-    :param fourcc: 4-character code of codec used to compress the frames. For example, ``CV_FOURCC('P','I','M,'1')``  is a MPEG-1 codec, ``CV_FOURCC('M','J','P','G')``  is a motion-jpeg codec etc. List of codes can be obtained at `Video Codecs by FOURCC <http://www.fourcc.org/codecs.php>`_ page.
+    :param fourcc: 4-character code of codec used to compress the frames. For example, ``CV_FOURCC('P','I','M','1')``  is an MPEG-1 codec, ``CV_FOURCC('M','J','P','G')``  is a motion-jpeg codec etc. List of codes can be obtained at `Video Codecs by FOURCC <http://www.fourcc.org/codecs.php>`_ page.
 
     :param fps: Framerate of the created video stream.
 
diff --git a/modules/highgui/include/opencv2/highgui.hpp b/modules/highgui/include/opencv2/highgui.hpp
index 601b872af..f05825f78 100644
--- a/modules/highgui/include/opencv2/highgui.hpp
+++ b/modules/highgui/include/opencv2/highgui.hpp
@@ -215,12 +215,14 @@ enum { IMREAD_UNCHANGED  = -1, // 8bit, color or not
        IMREAD_ANYCOLOR   = 4   // ?, any color
      };
 
-enum { IMWRITE_JPEG_QUALITY    = 1,
-       IMWRITE_PNG_COMPRESSION = 16,
-       IMWRITE_PNG_STRATEGY    = 17,
-       IMWRITE_PNG_BILEVEL     = 18,
-       IMWRITE_PXM_BINARY      = 32,
-       IMWRITE_WEBP_QUALITY    = 64
+enum { IMWRITE_JPEG_QUALITY     = 1,
+       IMWRITE_JPEG_PROGRESSIVE = 2,
+       IMWRITE_JPEG_OPTIMIZE    = 3,
+       IMWRITE_PNG_COMPRESSION  = 16,
+       IMWRITE_PNG_STRATEGY     = 17,
+       IMWRITE_PNG_BILEVEL      = 18,
+       IMWRITE_PXM_BINARY       = 32,
+       IMWRITE_WEBP_QUALITY     = 64
      };
 
 enum { IMWRITE_PNG_STRATEGY_DEFAULT      = 0,
diff --git a/modules/highgui/include/opencv2/highgui/highgui_c.h b/modules/highgui/include/opencv2/highgui/highgui_c.h
index ed8e2df0a..130302150 100644
--- a/modules/highgui/include/opencv2/highgui/highgui_c.h
+++ b/modules/highgui/include/opencv2/highgui/highgui_c.h
@@ -220,6 +220,8 @@ CVAPI(CvMat*) cvLoadImageM( const char* filename, int iscolor CV_DEFAULT(CV_LOAD
 enum
 {
     CV_IMWRITE_JPEG_QUALITY =1,
+    CV_IMWRITE_JPEG_PROGRESSIVE =2,
+    CV_IMWRITE_JPEG_OPTIMIZE =3,
     CV_IMWRITE_PNG_COMPRESSION =16,
     CV_IMWRITE_PNG_STRATEGY =17,
     CV_IMWRITE_PNG_BILEVEL =18,
@@ -463,6 +465,8 @@ enum
     CV_CAP_PROP_ANDROID_FOCUS_DISTANCE_NEAR = 8006,
     CV_CAP_PROP_ANDROID_FOCUS_DISTANCE_OPTIMAL = 8007,
     CV_CAP_PROP_ANDROID_FOCUS_DISTANCE_FAR = 8008,
+    CV_CAP_PROP_ANDROID_EXPOSE_LOCK = 8009,
+    CV_CAP_PROP_ANDROID_WHITEBALANCE_LOCK = 8010,
 
     // Properties of cameras available through AVFOUNDATION interface
     CV_CAP_PROP_IOS_DEVICE_FOCUS = 9001,
@@ -543,6 +547,7 @@ enum
 enum
 {
     CV_CAP_ANDROID_FOCUS_MODE_AUTO = 0,
+    CV_CAP_ANDROID_FOCUS_MODE_CONTINUOUS_PICTURE,
     CV_CAP_ANDROID_FOCUS_MODE_CONTINUOUS_VIDEO,
     CV_CAP_ANDROID_FOCUS_MODE_EDOF,
     CV_CAP_ANDROID_FOCUS_MODE_FIXED,
diff --git a/modules/highgui/src/cap_android.cpp b/modules/highgui/src/cap_android.cpp
index 082a12f00..dac245d2b 100644
--- a/modules/highgui/src/cap_android.cpp
+++ b/modules/highgui/src/cap_android.cpp
@@ -289,6 +289,10 @@ double CvCapture_Android::getProperty( int propIdx )
         return (double)m_activity->getProperty(ANDROID_CAMERA_PROPERTY_FOCUS_DISTANCE_OPTIMAL);
     case CV_CAP_PROP_ANDROID_FOCUS_DISTANCE_FAR:
         return (double)m_activity->getProperty(ANDROID_CAMERA_PROPERTY_FOCUS_DISTANCE_FAR);
+    case CV_CAP_PROP_ANDROID_EXPOSE_LOCK:
+        return (double)m_activity->getProperty(ANDROID_CAMERA_PROPERTY_EXPOSE_LOCK);
+    case CV_CAP_PROP_ANDROID_WHITEBALANCE_LOCK:
+        return (double)m_activity->getProperty(ANDROID_CAMERA_PROPERTY_WHITEBALANCE_LOCK);
     default:
         CV_Error( CV_StsOutOfRange, "Failed attempt to GET unsupported camera property." );
         break;
@@ -327,14 +331,23 @@ bool CvCapture_Android::setProperty( int propIdx, double propValue )
         case CV_CAP_PROP_ANDROID_ANTIBANDING:
             m_activity->setProperty(ANDROID_CAMERA_PROPERTY_ANTIBANDING, propValue);
             break;
+        case CV_CAP_PROP_ANDROID_EXPOSE_LOCK:
+            m_activity->setProperty(ANDROID_CAMERA_PROPERTY_EXPOSE_LOCK, propValue);
+            break;
+        case CV_CAP_PROP_ANDROID_WHITEBALANCE_LOCK:
+            m_activity->setProperty(ANDROID_CAMERA_PROPERTY_WHITEBALANCE_LOCK, propValue);
+            break;
         default:
             CV_Error( CV_StsOutOfRange, "Failed attempt to SET unsupported camera property." );
             return false;
         }
 
-        if (propIdx != CV_CAP_PROP_AUTOGRAB) {// property for highgui class CvCapture_Android only
+        // Only changes in frame size require camera restart
+        if ((propIdx == CV_CAP_PROP_FRAME_WIDTH) || (propIdx == CV_CAP_PROP_FRAME_HEIGHT))
+        {   // frame size changed: flag the camera parameters as changed
             m_CameraParamsChanged = true;
         }
+
         res = true;
     }
 
diff --git a/modules/highgui/src/cap_avfoundation.mm b/modules/highgui/src/cap_avfoundation.mm
index 71777f875..e24ae3297 100644
--- a/modules/highgui/src/cap_avfoundation.mm
+++ b/modules/highgui/src/cap_avfoundation.mm
@@ -1309,6 +1309,8 @@ bool CvVideoWriter_AVFoundation::writeFrame(const IplImage* iplimage) {
     }
 
     //cleanup
+    CFRelease(cfData);
+    CVPixelBufferRelease(pixelBuffer);
     CGImageRelease(cgImage);
     CGDataProviderRelease(provider);
     CGColorSpaceRelease(colorSpace);
diff --git a/modules/highgui/src/grfmt_jpeg.cpp b/modules/highgui/src/grfmt_jpeg.cpp
index 28c52e859..147f185e4 100644
--- a/modules/highgui/src/grfmt_jpeg.cpp
+++ b/modules/highgui/src/grfmt_jpeg.cpp
@@ -598,6 +598,8 @@ bool JpegEncoder::write( const Mat& img, const std::vector<int>& params )
         cinfo.in_color_space = channels > 1 ? JCS_RGB : JCS_GRAYSCALE;
 
         int quality = 95;
+        int progressive = 0;
+        int optimize = 0;
 
         for( size_t i = 0; i < params.size(); i += 2 )
         {
@@ -606,11 +608,25 @@ bool JpegEncoder::write( const Mat& img, const std::vector<int>& params )
                 quality = params[i+1];
                 quality = MIN(MAX(quality, 0), 100);
             }
+
+            if( params[i] == CV_IMWRITE_JPEG_PROGRESSIVE )
+            {
+                progressive = params[i+1];
+            }
+
+            if( params[i] == CV_IMWRITE_JPEG_OPTIMIZE )
+            {
+                optimize = params[i+1];
+            }
         }
 
         jpeg_set_defaults( &cinfo );
         jpeg_set_quality( &cinfo, quality,
                           TRUE /* limit to baseline-JPEG values */ );
+        if( progressive )
+            jpeg_simple_progression( &cinfo );
+        if( optimize )
+            cinfo.optimize_coding = TRUE;
         jpeg_start_compress( &cinfo, TRUE );
 
         if( channels > 1 )
diff --git a/modules/highgui/test/test_grfmt.cpp b/modules/highgui/test/test_grfmt.cpp
index 11533e3ca..2f7640629 100644
--- a/modules/highgui/test/test_grfmt.cpp
+++ b/modules/highgui/test/test_grfmt.cpp
@@ -386,6 +386,54 @@ TEST(Highgui_Jpeg, encode_empty)
 
     ASSERT_THROW(cv::imencode(".jpg", img, jpegImg), cv::Exception);
 }
+
+TEST(Highgui_Jpeg, encode_decode_progressive_jpeg)
+{
+    cvtest::TS& ts = *cvtest::TS::ptr();
+    string input = string(ts.get_data_path()) + "../cv/shared/lena.png";
+    cv::Mat img = cv::imread(input);
+    ASSERT_FALSE(img.empty());
+
+    std::vector<int> params;
+    params.push_back(IMWRITE_JPEG_PROGRESSIVE);
+    params.push_back(1);
+    // Encode once with the progressive flag set and read the result back.
+    string output_progressive = cv::tempfile(".jpg");
+    EXPECT_NO_THROW(cv::imwrite(output_progressive, img, params));
+    cv::Mat img_jpg_progressive = cv::imread(output_progressive);
+    // Encode again with default parameters to get the reference image.
+    string output_normal = cv::tempfile(".jpg");
+    EXPECT_NO_THROW(cv::imwrite(output_normal, img));
+    cv::Mat img_jpg_normal = cv::imread(output_normal);
+    // Both files must decode to identical pixels (maximum absolute difference of 0).
+    EXPECT_EQ(0, cv::norm(img_jpg_progressive, img_jpg_normal, NORM_INF));
+    remove(output_progressive.c_str());
+    remove(output_normal.c_str()); // the reference file is a temp file too and must not be left behind
+}
+
+TEST(Highgui_Jpeg, encode_decode_optimize_jpeg)
+{
+    cvtest::TS& ts = *cvtest::TS::ptr();
+    string input = string(ts.get_data_path()) + "../cv/shared/lena.png";
+    cv::Mat img = cv::imread(input);
+    ASSERT_FALSE(img.empty());
+
+    std::vector<int> params;
+    params.push_back(IMWRITE_JPEG_OPTIMIZE);
+    params.push_back(1);
+    // Encode once with the optimize flag set and read the result back.
+    string output_optimized = cv::tempfile(".jpg");
+    EXPECT_NO_THROW(cv::imwrite(output_optimized, img, params));
+    cv::Mat img_jpg_optimized = cv::imread(output_optimized);
+    // Encode again with default parameters to get the reference image.
+    string output_normal = cv::tempfile(".jpg");
+    EXPECT_NO_THROW(cv::imwrite(output_normal, img));
+    cv::Mat img_jpg_normal = cv::imread(output_normal);
+    // Both files must decode to identical pixels (maximum absolute difference of 0).
+    EXPECT_EQ(0, cv::norm(img_jpg_optimized, img_jpg_normal, NORM_INF));
+    remove(output_optimized.c_str());
+    remove(output_normal.c_str()); // the reference file is a temp file too and must not be left behind
+}
 #endif
 
 
diff --git a/modules/imgproc/perf/opencl/perf_filters.cpp b/modules/imgproc/perf/opencl/perf_filters.cpp
index 57b928c28..f7329e319 100644
--- a/modules/imgproc/perf/opencl/perf_filters.cpp
+++ b/modules/imgproc/perf/opencl/perf_filters.cpp
@@ -211,7 +211,7 @@ OCL_PERF_TEST_P(SobelFixture, Sobel,
 
     OCL_TEST_CYCLE() cv::Sobel(src, dst, -1, dx, dy);
 
-    SANITY_CHECK(dst);
+    SANITY_CHECK(dst, 1e-6);
 }
 
 ///////////// Scharr ////////////////////////
diff --git a/modules/imgproc/perf/opencl/perf_imgproc.cpp b/modules/imgproc/perf/opencl/perf_imgproc.cpp
index 0d63e940e..71449872f 100644
--- a/modules/imgproc/perf/opencl/perf_imgproc.cpp
+++ b/modules/imgproc/perf/opencl/perf_imgproc.cpp
@@ -95,6 +95,34 @@ OCL_PERF_TEST_P(CalcHistFixture, CalcHist, OCL_TEST_SIZES)
     SANITY_CHECK(hist);
 }
 
+///////////// calcBackProject ////////////////////////
+
+typedef TestBaseWithParam<Size> CalcBackProjFixture;
+
+OCL_PERF_TEST_P(CalcBackProjFixture, CalcBackProj, OCL_TEST_SIZES)
+{ // Perf test: cv::calcBackProject on a single-channel 8-bit UMat, using a histogram of the same image.
+    const Size srcSize = GetParam();
+
+    const std::vector<int> channels(1, 0); // one entry: channel index 0
+    std::vector<float> ranges(2);
+    std::vector<int> histSize(1, 256); // one dimension with 256 bins
+    ranges[0] = 0;
+    ranges[1] = 256;
+
+    checkDeviceMaxMemoryAllocSize(srcSize, CV_8UC1);
+
+    UMat src(srcSize, CV_8UC1), hist(256, 1, CV_32FC1), dst(srcSize, CV_8UC1);
+    declare.in(src, WARMUP_RNG).out(hist);
+    // The histogram is computed once here, outside the test cycle; it is only an input below.
+    cv::calcHist(std::vector<UMat>(1, src), channels, noArray(), hist, histSize, ranges, false);
+
+    declare.in(src, WARMUP_RNG).out(dst);
+    OCL_TEST_CYCLE() cv::calcBackProject(std::vector<UMat>(1,src), channels, hist, dst, ranges, 1);
+
+    SANITY_CHECK_NOTHING(); // no result comparison is done for this test
+}
+
+
 /////////// CopyMakeBorder //////////////////////
 
 CV_ENUM(Border, BORDER_CONSTANT, BORDER_REPLICATE, BORDER_REFLECT, BORDER_WRAP, BORDER_REFLECT_101)
diff --git a/modules/imgproc/src/canny.cpp b/modules/imgproc/src/canny.cpp
index 766cb309c..fbc92dde2 100644
--- a/modules/imgproc/src/canny.cpp
+++ b/modules/imgproc/src/canny.cpp
@@ -100,19 +100,29 @@ static bool ocl_Canny(InputArray _src, OutputArray _dst, float low_thresh, float
         low_thresh = std::min(32767.0f, low_thresh);
         high_thresh = std::min(32767.0f, high_thresh);
 
-        if (low_thresh > 0) low_thresh *= low_thresh;
-        if (high_thresh > 0) high_thresh *= high_thresh;
+        if (low_thresh > 0)
+            low_thresh *= low_thresh;
+        if (high_thresh > 0)
+            high_thresh *= high_thresh;
     }
     int low = cvFloor(low_thresh), high = cvFloor(high_thresh);
     Size esize(size.width + 2, size.height + 2);
 
     UMat mag;
-    size_t globalsize[2] = { size.width * cn, size.height }, localsize[2] = { 16, 16 };
+    size_t globalsize[2] = { size.width, size.height }, localsize[2] = { 16, 16 };
 
     if (aperture_size == 3 && !_src.isSubmatrix())
     {
         // Sobel calculation
-        ocl::Kernel calcSobelRowPassKernel("calcSobelRowPass", ocl::imgproc::canny_oclsrc);
+        char cvt[2][40];
+        ocl::Kernel calcSobelRowPassKernel("calcSobelRowPass", ocl::imgproc::canny_oclsrc,
+                                           format("-D OP_SOBEL -D cn=%d -D shortT=%s -D ucharT=%s"
+                                                  " -D convertToIntT=%s -D intT=%s -D convertToShortT=%s", cn,
+                                                  ocl::typeToStr(CV_16SC(cn)),
+                                                  ocl::typeToStr(CV_8UC(cn)),
+                                                  ocl::convertTypeStr(CV_8U, CV_32S, cn, cvt[0]),
+                                                  ocl::typeToStr(CV_32SC(cn)),
+                                                  ocl::convertTypeStr(CV_32S, CV_16S, cn, cvt[1])));
         if (calcSobelRowPassKernel.empty())
             return false;
 
@@ -126,58 +136,62 @@ static bool ocl_Canny(InputArray _src, OutputArray _dst, float low_thresh, float
 
         // magnitude calculation
         ocl::Kernel magnitudeKernel("calcMagnitude_buf", ocl::imgproc::canny_oclsrc,
-                                    L2gradient ? " -D L2GRAD" : "");
+                                    format("-D cn=%d%s -D OP_MAG_BUF -D shortT=%s -D convertToIntT=%s -D intT=%s",
+                                           cn, L2gradient ? " -D L2GRAD" : "",
+                                           ocl::typeToStr(CV_16SC(cn)),
+                                           ocl::convertTypeStr(CV_16S, CV_32S, cn, cvt[0]),
+                                           ocl::typeToStr(CV_32SC(cn))));
         if (magnitudeKernel.empty())
             return false;
 
-        mag = UMat(esize, CV_32SC(cn), Scalar::all(0));
+        mag = UMat(esize, CV_32SC1, Scalar::all(0));
         dx.create(size, CV_16SC(cn));
         dy.create(size, CV_16SC(cn));
 
         magnitudeKernel.args(ocl::KernelArg::ReadOnlyNoSize(dxBuf), ocl::KernelArg::ReadOnlyNoSize(dyBuf),
                              ocl::KernelArg::WriteOnlyNoSize(dx), ocl::KernelArg::WriteOnlyNoSize(dy),
-                             ocl::KernelArg::WriteOnlyNoSize(mag, cn), size.height, size.width);
+                             ocl::KernelArg::WriteOnlyNoSize(mag), size.height, size.width);
 
         if (!magnitudeKernel.run(2, globalsize, localsize, false))
             return false;
     }
     else
     {
-        dx.create(size, CV_16SC(cn));
-        dy.create(size, CV_16SC(cn));
-
-        Sobel(_src, dx, CV_16SC1, 1, 0, aperture_size, 1, 0, BORDER_REPLICATE);
-        Sobel(_src, dy, CV_16SC1, 0, 1, aperture_size, 1, 0, BORDER_REPLICATE);
+        Sobel(_src, dx, CV_16S, 1, 0, aperture_size, 1, 0, BORDER_REPLICATE);
+        Sobel(_src, dy, CV_16S, 0, 1, aperture_size, 1, 0, BORDER_REPLICATE);
 
         // magnitude calculation
         ocl::Kernel magnitudeKernel("calcMagnitude", ocl::imgproc::canny_oclsrc,
-                                    L2gradient ? " -D L2GRAD" : "");
+                                    format("-D OP_MAG -D cn=%d%s -D intT=int -D shortT=short -D convertToIntT=convert_int_sat",
+                                           cn, L2gradient ? " -D L2GRAD" : ""));
         if (magnitudeKernel.empty())
             return false;
 
-        mag = UMat(esize, CV_32SC(cn), Scalar::all(0));
+        mag = UMat(esize, CV_32SC1, Scalar::all(0));
         magnitudeKernel.args(ocl::KernelArg::ReadOnlyNoSize(dx), ocl::KernelArg::ReadOnlyNoSize(dy),
-                             ocl::KernelArg::WriteOnlyNoSize(mag, cn), size.height, size.width);
+                             ocl::KernelArg::WriteOnlyNoSize(mag), size.height, size.width);
 
         if (!magnitudeKernel.run(2, globalsize, NULL, false))
             return false;
     }
 
     // map calculation
-    ocl::Kernel calcMapKernel("calcMap", ocl::imgproc::canny_oclsrc);
+    ocl::Kernel calcMapKernel("calcMap", ocl::imgproc::canny_oclsrc,
+                              format("-D OP_MAP -D cn=%d", cn));
     if (calcMapKernel.empty())
         return false;
 
-    UMat map(esize, CV_32SC(cn));
+    UMat map(esize, CV_32SC1);
     calcMapKernel.args(ocl::KernelArg::ReadOnlyNoSize(dx), ocl::KernelArg::ReadOnlyNoSize(dy),
-                       ocl::KernelArg::ReadOnlyNoSize(mag), ocl::KernelArg::WriteOnlyNoSize(map, cn),
+                       ocl::KernelArg::ReadOnlyNoSize(mag), ocl::KernelArg::WriteOnlyNoSize(map),
                        size.height, size.width, low, high);
 
     if (!calcMapKernel.run(2, globalsize, localsize, false))
         return false;
 
     // local hysteresis thresholding
-    ocl::Kernel edgesHysteresisLocalKernel("edgesHysteresisLocal", ocl::imgproc::canny_oclsrc);
+    ocl::Kernel edgesHysteresisLocalKernel("edgesHysteresisLocal", ocl::imgproc::canny_oclsrc,
+                                           "-D OP_HYST_LOCAL");
     if (edgesHysteresisLocalKernel.empty())
         return false;
 
@@ -193,7 +207,8 @@ static bool ocl_Canny(InputArray _src, OutputArray _dst, float low_thresh, float
 
     for ( ; ; )
     {
-        ocl::Kernel edgesHysteresisGlobalKernel("edgesHysteresisGlobal", ocl::imgproc::canny_oclsrc);
+        ocl::Kernel edgesHysteresisGlobalKernel("edgesHysteresisGlobal", ocl::imgproc::canny_oclsrc,
+                                                "-D OP_HYST_GLOBAL");
         if (edgesHysteresisGlobalKernel.empty())
             return false;
 
@@ -221,14 +236,15 @@ static bool ocl_Canny(InputArray _src, OutputArray _dst, float low_thresh, float
     }
 
     // get edges
-    ocl::Kernel getEdgesKernel("getEdges", ocl::imgproc::canny_oclsrc);
+    ocl::Kernel getEdgesKernel("getEdges", ocl::imgproc::canny_oclsrc, "-D OP_EDGES");
     if (getEdgesKernel.empty())
         return false;
 
-    _dst.create(size, CV_8UC(cn));
+    _dst.create(size, CV_8UC1);
     UMat dst = _dst.getUMat();
 
     getEdgesKernel.args(ocl::KernelArg::ReadOnlyNoSize(map), ocl::KernelArg::WriteOnly(dst));
+
     return getEdgesKernel.run(2, globalsize, NULL, false);
 }
 
@@ -254,12 +270,12 @@ void cv::Canny( InputArray _src, OutputArray _dst,
     }
 
     if ((aperture_size & 1) == 0 || (aperture_size != -1 && (aperture_size < 3 || aperture_size > 7)))
-        CV_Error(CV_StsBadFlag, "");
+        CV_Error(CV_StsBadFlag, "Aperture size should be odd");
 
     if (low_thresh > high_thresh)
         std::swap(low_thresh, high_thresh);
 
-    CV_OCL_RUN(_dst.isUMat() && cn == 1,
+    CV_OCL_RUN(_dst.isUMat() && (cn == 1 || cn == 3),
                ocl_Canny(_src, _dst, (float)low_thresh, (float)high_thresh, aperture_size, L2gradient, cn, size))
 
     Mat src = _src.getMat(), dst = _dst.getMat();
diff --git a/modules/imgproc/src/deriv.cpp b/modules/imgproc/src/deriv.cpp
index 31a8b1b93..df2f371f5 100644
--- a/modules/imgproc/src/deriv.cpp
+++ b/modules/imgproc/src/deriv.cpp
@@ -11,6 +11,7 @@
 //                For Open Source Computer Vision Library
 //
 // Copyright (C) 2000, Intel Corporation, all rights reserved.
+// Copyright (C) 2014, Itseez, Inc, all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // Redistribution and use in source and binary forms, with or without modification,
@@ -40,6 +41,8 @@
 //M*/
 
 #include "precomp.hpp"
+#include "opencl_kernels.hpp"
+
 #if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7)
 static IppStatus sts = ippInit();
 #endif
@@ -495,6 +498,58 @@ void cv::Scharr( InputArray _src, OutputArray _dst, int ddepth, int dx, int dy,
     sepFilter2D( _src, _dst, ddepth, kx, ky, Point(-1, -1), delta, borderType );
 }
 
+#ifdef HAVE_OPENCL
+
+namespace cv {
+
+static bool ocl_Laplacian5(InputArray _src, OutputArray _dst,
+                           const Mat & kd, const Mat & ks, double scale, double delta,
+                           int borderType, int depth, int ddepth)
+{ // OpenCL path of cv::Laplacian: two separable filter passes combined by the "sumConvert" kernel.
+    int iscale = cvRound(scale), idelta = cvRound(delta);
+    bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0,
+            floatCoeff = std::fabs(delta - idelta) > DBL_EPSILON || std::fabs(scale - iscale) > DBL_EPSILON; // scale or delta is not an integer
+    int cn = _src.channels(), wdepth = std::max(depth, floatCoeff ? CV_32F : CV_32S), kercn = 1;
+
+    if (!doubleSupport && wdepth == CV_64F)
+        return false; // the device has no double support, so this path cannot be used
+
+    char cvt[2][40]; // scratch buffers filled by ocl::convertTypeStr
+    ocl::Kernel k("sumConvert", ocl::imgproc::laplacian5_oclsrc,
+                  format("-D srcT=%s -D WT=%s -D dstT=%s -D coeffT=%s -D wdepth=%d "
+                         "-D convertToWT=%s -D convertToDT=%s%s",
+                         ocl::typeToStr(CV_MAKE_TYPE(depth, kercn)),
+                         ocl::typeToStr(CV_MAKE_TYPE(wdepth, kercn)),
+                         ocl::typeToStr(CV_MAKE_TYPE(ddepth, kercn)),
+                         ocl::typeToStr(wdepth), wdepth,
+                         ocl::convertTypeStr(depth, wdepth, kercn, cvt[0]),
+                         ocl::convertTypeStr(wdepth, ddepth, kercn, cvt[1]),
+                         doubleSupport ? " -D DOUBLE_SUPPORT" : ""));
+    if (k.empty())
+        return false; // the kernel could not be created
+
+    UMat d2x, d2y;
+    sepFilter2D(_src, d2x, depth, kd, ks, Point(-1, -1), 0, borderType); // kd as first kernel, ks as second
+    sepFilter2D(_src, d2y, depth, ks, kd, Point(-1, -1), 0, borderType); // same kernels with their roles swapped
+
+    UMat dst = _dst.getUMat(); // NOTE(review): _dst is not created here - presumably allocated by the caller, confirm
+
+    ocl::KernelArg d2xarg = ocl::KernelArg::ReadOnlyNoSize(d2x),
+            d2yarg = ocl::KernelArg::ReadOnlyNoSize(d2y),
+            dstarg = ocl::KernelArg::WriteOnly(dst, cn, kercn);
+    // scale and delta are passed as float for floating-point working depths, otherwise as rounded ints
+    if (wdepth >= CV_32F)
+        k.args(d2xarg, d2yarg, dstarg, (float)scale, (float)delta);
+    else
+        k.args(d2xarg, d2yarg, dstarg, iscale, idelta);
+
+    size_t globalsize[] = { dst.cols * cn / kercn, dst.rows }; // one work-item per kercn channel values of a row
+    return k.run(2, globalsize, NULL, false);
+}
+
+}
+
+#endif
 
 void cv::Laplacian( InputArray _src, OutputArray _dst, int ddepth, int ksize,
                     double scale, double delta, int borderType )
@@ -531,27 +586,28 @@ void cv::Laplacian( InputArray _src, OutputArray _dst, int ddepth, int ksize,
     }
     else
     {
-        Mat src = _src.getMat(), dst = _dst.getMat();
-        const size_t STRIPE_SIZE = 1 << 14;
-
-        int depth = src.depth();
-        int ktype = std::max(CV_32F, std::max(ddepth, depth));
-        int wdepth = depth == CV_8U && ksize <= 5 ? CV_16S : depth <= CV_32F ? CV_32F : CV_64F;
-        int wtype = CV_MAKETYPE(wdepth, src.channels());
+        int ktype = std::max(CV_32F, std::max(ddepth, sdepth));
+        int wdepth = sdepth == CV_8U && ksize <= 5 ? CV_16S : sdepth <= CV_32F ? CV_32F : CV_64F;
+        int wtype = CV_MAKETYPE(wdepth, cn);
         Mat kd, ks;
         getSobelKernels( kd, ks, 2, 0, ksize, false, ktype );
-        int dtype = CV_MAKETYPE(ddepth, src.channels());
 
-        int dy0 = std::min(std::max((int)(STRIPE_SIZE/(getElemSize(src.type())*src.cols)), 1), src.rows);
-        Ptr<FilterEngine> fx = createSeparableLinearFilter(src.type(),
+        CV_OCL_RUN(_dst.isUMat(),
+                   ocl_Laplacian5(_src, _dst, kd, ks, scale,
+                                  delta, borderType, wdepth, ddepth))
+
+        const size_t STRIPE_SIZE = 1 << 14;
+        Ptr<FilterEngine> fx = createSeparableLinearFilter(stype,
             wtype, kd, ks, Point(-1,-1), 0, borderType, borderType, Scalar() );
-        Ptr<FilterEngine> fy = createSeparableLinearFilter(src.type(),
+        Ptr<FilterEngine> fy = createSeparableLinearFilter(stype,
             wtype, ks, kd, Point(-1,-1), 0, borderType, borderType, Scalar() );
 
+        Mat src = _src.getMat(), dst = _dst.getMat();
         int y = fx->start(src), dsty = 0, dy = 0;
         fy->start(src);
         const uchar* sptr = src.data + y*src.step;
 
+        int dy0 = std::min(std::max((int)(STRIPE_SIZE/(CV_ELEM_SIZE(stype)*src.cols)), 1), src.rows);
         Mat d2x( dy0 + kd.rows - 1, src.cols, wtype );
         Mat d2y( dy0 + kd.rows - 1, src.cols, wtype );
 
@@ -564,7 +620,7 @@ void cv::Laplacian( InputArray _src, OutputArray _dst, int ddepth, int ksize,
                 Mat dstripe = dst.rowRange(dsty, dsty + dy);
                 d2x.rows = d2y.rows = dy; // modify the headers, which should work
                 d2x += d2y;
-                d2x.convertTo( dstripe, dtype, scale, delta );
+                d2x.convertTo( dstripe, ddepth, scale, delta );
             }
         }
     }
diff --git a/modules/imgproc/src/filter.cpp b/modules/imgproc/src/filter.cpp
index ea0baf6b0..2bc6b8a70 100644
--- a/modules/imgproc/src/filter.cpp
+++ b/modules/imgproc/src/filter.cpp
@@ -42,7 +42,6 @@
 
 #include "precomp.hpp"
 #include "opencl_kernels.hpp"
-#include <sstream>
 
 /****************************************************************************************\
                                     Base Image Filter
@@ -3134,7 +3133,7 @@ template<typename ST, class CastOp, class VecOp> struct Filter2D : public BaseFi
 // b e h b e h 0 0
 // c f i c f i 0 0
 template <typename T>
-static int _prepareKernelFilter2D(std::vector<T>& data, const Mat &kernel)
+static int _prepareKernelFilter2D(std::vector<T> & data, const Mat & kernel)
 {
     Mat _kernel; kernel.convertTo(_kernel, DataDepth<T>::value);
     int size_y_aligned = ROUNDUP(kernel.rows * 2, 4);
@@ -3154,75 +3153,52 @@ static bool ocl_filter2D( InputArray _src, OutputArray _dst, int ddepth,
                    InputArray _kernel, Point anchor,
                    double delta, int borderType )
 {
-    if (abs(delta) > FLT_MIN)
+    int type = _src.type(), sdepth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
+    ddepth = ddepth < 0 ? sdepth : ddepth;
+    int dtype = CV_MAKE_TYPE(ddepth, cn), wdepth = std::max(std::max(sdepth, ddepth), CV_32F),
+            wtype = CV_MAKE_TYPE(wdepth, cn);
+    if (cn > 4)
         return false;
 
-    int type = _src.type();
-    int cn = CV_MAT_CN(type);
-    if ((1 != cn) && (2 != cn) && (4 != cn))
-        return false;//TODO
-
-    int sdepth = CV_MAT_DEPTH(type);
     Size ksize = _kernel.size();
-    if( anchor.x < 0 )
+    if (anchor.x < 0)
         anchor.x = ksize.width / 2;
-    if( anchor.y < 0 )
+    if (anchor.y < 0)
         anchor.y = ksize.height / 2;
-    if( ddepth < 0 )
-        ddepth = sdepth;
-    else if (ddepth != sdepth)
-        return false;
 
-    bool isIsolatedBorder = (borderType & BORDER_ISOLATED) != 0;
-    bool useDouble = (CV_64F == sdepth);
+    bool isolated = (borderType & BORDER_ISOLATED) != 0;
+    borderType &= ~BORDER_ISOLATED;
     const cv::ocl::Device &device = cv::ocl::Device::getDefault();
-    int doubleFPConfig = device.doubleFPConfig();
-    if (useDouble && (0 == doubleFPConfig))
+    bool doubleSupport = device.doubleFPConfig() > 0;
+    if (wdepth == CV_64F && !doubleSupport)
         return false;
 
-    const char* btype = NULL;
-    switch (borderType & ~BORDER_ISOLATED)
-    {
-    case BORDER_CONSTANT:
-        btype = "BORDER_CONSTANT";
-        break;
-    case BORDER_REPLICATE:
-        btype = "BORDER_REPLICATE";
-        break;
-    case BORDER_REFLECT:
-        btype = "BORDER_REFLECT";
-        break;
-    case BORDER_WRAP:
-        return false;
-    case BORDER_REFLECT101:
-        btype = "BORDER_REFLECT_101";
-        break;
-    }
+    const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT",
+                                       "BORDER_WRAP", "BORDER_REFLECT_101" };
 
     cv::Mat kernelMat = _kernel.getMat();
     std::vector<float> kernelMatDataFloat;
-    std::vector<double> kernelMatDataDouble;
-    int kernel_size_y2_aligned = useDouble ?
-            _prepareKernelFilter2D<double>(kernelMatDataDouble, kernelMat)
-            : _prepareKernelFilter2D<float>(kernelMatDataFloat, kernelMat);
+    int kernel_size_y2_aligned = _prepareKernelFilter2D<float>(kernelMatDataFloat, kernelMat);
 
+    cv::Size sz = _src.size(), wholeSize;
+    size_t globalsize[2] = { sz.width, sz.height }, localsize[2] = { 0, 1 };
 
-    cv::Size sz = _src.size();
-    size_t globalsize[2] = {sz.width, sz.height};
-    size_t localsize[2] = {0, 1};
-
-    ocl::Kernel kernel;
-    UMat src; Size wholeSize;
-    if (!isIsolatedBorder)
+    ocl::Kernel k;
+    UMat src = _src.getUMat();
+    if (!isolated)
     {
-        src = _src.getUMat();
         Point ofs;
         src.locateROI(wholeSize, ofs);
     }
 
-    size_t maxWorkItemSizes[32]; device.maxWorkItemSizes(maxWorkItemSizes);
+    size_t maxWorkItemSizes[32];
+    device.maxWorkItemSizes(maxWorkItemSizes);
     size_t tryWorkItems = maxWorkItemSizes[0];
-    for (;;)
+    char cvt[2][40];
+
+    String kerStr = ocl::kernelToStr(kernelMatDataFloat, CV_32F);
+
+    for ( ; ; )
     {
         size_t BLOCK_SIZE = tryWorkItems;
         while (BLOCK_SIZE > 32 && BLOCK_SIZE >= (size_t)ksize.width * 2 && BLOCK_SIZE > (size_t)sz.width * 2)
@@ -3242,32 +3218,36 @@ static bool ocl_filter2D( InputArray _src, OutputArray _dst, int ddepth,
         int requiredLeft = (int)BLOCK_SIZE; // not this: anchor.x;
         int requiredBottom = ksize.height - 1 - anchor.y;
         int requiredRight = (int)BLOCK_SIZE; // not this: ksize.width - 1 - anchor.x;
-        int h = isIsolatedBorder ? sz.height : wholeSize.height;
-        int w = isIsolatedBorder ? sz.width : wholeSize.width;
+        int h = isolated ? sz.height : wholeSize.height;
+        int w = isolated ? sz.width : wholeSize.width;
         bool extra_extrapolation = h < requiredTop || h < requiredBottom || w < requiredLeft || w < requiredRight;
 
         if ((w < ksize.width) || (h < ksize.height))
             return false;
 
-        char build_options[1024];
-        sprintf(build_options, "-D LOCAL_SIZE=%d -D BLOCK_SIZE_Y=%d -D DATA_DEPTH=%d -D DATA_CHAN=%d -D USE_DOUBLE=%d "
-                "-D ANCHOR_X=%d -D ANCHOR_Y=%d -D KERNEL_SIZE_X=%d -D KERNEL_SIZE_Y=%d -D KERNEL_SIZE_Y2_ALIGNED=%d "
-                "-D %s -D %s -D %s",
-                (int)BLOCK_SIZE, (int)BLOCK_SIZE_Y,
-                sdepth, cn, useDouble ? 1 : 0,
-                anchor.x, anchor.y, ksize.width, ksize.height, kernel_size_y2_aligned,
-                btype,
-                extra_extrapolation ? "EXTRA_EXTRAPOLATION" : "NO_EXTRA_EXTRAPOLATION",
-                isIsolatedBorder ? "BORDER_ISOLATED" : "NO_BORDER_ISOLATED");
+        String opts = format("-D LOCAL_SIZE=%d -D BLOCK_SIZE_Y=%d -D cn=%d "
+                             "-D ANCHOR_X=%d -D ANCHOR_Y=%d -D KERNEL_SIZE_X=%d -D KERNEL_SIZE_Y=%d "
+                             "-D KERNEL_SIZE_Y2_ALIGNED=%d -D %s -D %s -D %s%s%s "
+                             "-D srcT=%s -D srcT1=%s -D dstT=%s -D dstT1=%s -D WT=%s -D WT1=%s "
+                             "-D convertToWT=%s -D convertToDstT=%s",
+                             (int)BLOCK_SIZE, (int)BLOCK_SIZE_Y, cn, anchor.x, anchor.y,
+                             ksize.width, ksize.height, kernel_size_y2_aligned, borderMap[borderType],
+                             extra_extrapolation ? "EXTRA_EXTRAPOLATION" : "NO_EXTRA_EXTRAPOLATION",
+                             isolated ? "BORDER_ISOLATED" : "NO_BORDER_ISOLATED",
+                             doubleSupport ? " -D DOUBLE_SUPPORT" : "", kerStr.c_str(),
+                             ocl::typeToStr(type), ocl::typeToStr(sdepth), ocl::typeToStr(dtype),
+                             ocl::typeToStr(ddepth), ocl::typeToStr(wtype), ocl::typeToStr(wdepth),
+                             ocl::convertTypeStr(sdepth, wdepth, cn, cvt[0]),
+                             ocl::convertTypeStr(wdepth, ddepth, cn, cvt[1]));
 
         localsize[0] = BLOCK_SIZE;
         globalsize[0] = DIVUP(sz.width, BLOCK_SIZE - (ksize.width - 1)) * BLOCK_SIZE;
         globalsize[1] = DIVUP(sz.height, BLOCK_SIZE_Y);
 
-        cv::String errmsg;
-        if (!kernel.create("filter2D", cv::ocl::imgproc::filter2D_oclsrc, build_options))
+        if (!k.create("filter2D", cv::ocl::imgproc::filter2D_oclsrc, opts))
             return false;
-        size_t kernelWorkGroupSize = kernel.workGroupSize();
+
+        size_t kernelWorkGroupSize = k.workGroupSize();
         if (localsize[0] <= kernelWorkGroupSize)
             break;
         if (BLOCK_SIZE < kernelWorkGroupSize)
@@ -3275,242 +3255,238 @@ static bool ocl_filter2D( InputArray _src, OutputArray _dst, int ddepth,
         tryWorkItems = kernelWorkGroupSize;
     }
 
-    _dst.create(sz, CV_MAKETYPE(ddepth, cn));
+    _dst.create(sz, dtype);
     UMat dst = _dst.getUMat();
-    if (src.empty())
-        src = _src.getUMat();
-
-    int idxArg = 0;
-    idxArg = kernel.set(idxArg, ocl::KernelArg::PtrReadOnly(src));
-    idxArg = kernel.set(idxArg, (int)src.step);
 
     int srcOffsetX = (int)((src.offset % src.step) / src.elemSize());
     int srcOffsetY = (int)(src.offset / src.step);
-    int srcEndX = (isIsolatedBorder ? (srcOffsetX + sz.width) : wholeSize.width);
-    int srcEndY = (isIsolatedBorder ? (srcOffsetY + sz.height) : wholeSize.height);
-    idxArg = kernel.set(idxArg, srcOffsetX);
-    idxArg = kernel.set(idxArg, srcOffsetY);
-    idxArg = kernel.set(idxArg, srcEndX);
-    idxArg = kernel.set(idxArg, srcEndY);
+    int srcEndX = (isolated ? (srcOffsetX + sz.width) : wholeSize.width);
+    int srcEndY = (isolated ? (srcOffsetY + sz.height) : wholeSize.height);
 
-    idxArg = kernel.set(idxArg, ocl::KernelArg::WriteOnly(dst));
-    float borderValue[4] = {0, 0, 0, 0};
-    double borderValueDouble[4] = {0, 0, 0, 0};
-    if ((borderType & ~BORDER_ISOLATED) == BORDER_CONSTANT)
-    {
-        int cnocl = (3 == cn) ? 4 : cn;
-        if (useDouble)
-            idxArg = kernel.set(idxArg, (void *)&borderValueDouble[0], sizeof(double) * cnocl);
-        else
-            idxArg = kernel.set(idxArg, (void *)&borderValue[0], sizeof(float) * cnocl);
-    }
-    if (useDouble)
-    {
-        UMat kernalDataUMat(kernelMatDataDouble, true);
-        idxArg = kernel.set(idxArg, ocl::KernelArg::PtrReadOnly(kernalDataUMat));
-    }
-    else
-    {
-        UMat kernalDataUMat(kernelMatDataFloat, true);
-        idxArg = kernel.set(idxArg, ocl::KernelArg::PtrReadOnly(kernalDataUMat));
-    }
-    return kernel.run(2, globalsize, localsize, true);
+    k.args(ocl::KernelArg::PtrReadOnly(src), (int)src.step, srcOffsetX, srcOffsetY,
+           srcEndX, srcEndY, ocl::KernelArg::WriteOnly(dst), (float)delta);
+
+    return k.run(2, globalsize, localsize, false);
 }
 
-static bool ocl_sepRowFilter2D( UMat &src, UMat &buf, Mat &kernelX, int anchor, int borderType, bool sync)
+static bool ocl_sepRowFilter2D(const UMat & src, UMat & buf, const Mat & kernelX, int anchor,
+                               int borderType, int ddepth, bool fast8uc1)
 {
-    int type = src.type();
-    int cn = CV_MAT_CN(type);
-    int sdepth = CV_MAT_DEPTH(type);
+    int type = src.type(), cn = CV_MAT_CN(type), sdepth = CV_MAT_DEPTH(type);
+    bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
     Size bufSize = buf.size();
 
+    if (!doubleSupport && (sdepth == CV_64F || ddepth == CV_64F))
+        return false;
+
 #ifdef ANDROID
     size_t localsize[2] = {16, 10};
 #else
     size_t localsize[2] = {16, 16};
 #endif
+
     size_t globalsize[2] = {DIVUP(bufSize.width, localsize[0]) * localsize[0], DIVUP(bufSize.height, localsize[1]) * localsize[1]};
-    if (CV_8U == sdepth)
-    {
-        switch (cn)
-        {
-        case 1:
-            globalsize[0] = DIVUP((bufSize.width + 3) >> 2, localsize[0]) * localsize[0];
-            break;
-        case 2:
-            globalsize[0] = DIVUP((bufSize.width + 1) >> 1, localsize[0]) * localsize[0];
-            break;
-        case 4:
-            globalsize[0] = DIVUP(bufSize.width, localsize[0]) * localsize[0];
-            break;
-        }
-    }
+    if (fast8uc1)
+        globalsize[0] = DIVUP((bufSize.width + 3) >> 2, localsize[0]) * localsize[0];
 
-    int radiusX = anchor;
-    int radiusY = (int)((buf.rows - src.rows) >> 1);
+    int radiusX = anchor, radiusY = (buf.rows - src.rows) >> 1;
 
-    bool isIsolatedBorder = (borderType & BORDER_ISOLATED) != 0;
-    const char* btype = NULL;
-    switch (borderType & ~BORDER_ISOLATED)
-    {
-    case BORDER_CONSTANT:
-        btype = "BORDER_CONSTANT";
-        break;
-    case BORDER_REPLICATE:
-        btype = "BORDER_REPLICATE";
-        break;
-    case BORDER_REFLECT:
-        btype = "BORDER_REFLECT";
-        break;
-    case BORDER_WRAP:
-        btype = "BORDER_WRAP";
-        break;
-    case BORDER_REFLECT101:
-        btype = "BORDER_REFLECT_101";
-        break;
-    default:
-        return false;
-    }
+    bool isolated = (borderType & BORDER_ISOLATED) != 0;
+    const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", "BORDER_WRAP", "BORDER_REFLECT_101" },
+        * const btype = borderMap[borderType & ~BORDER_ISOLATED];
 
     bool extra_extrapolation = src.rows < (int)((-radiusY + globalsize[1]) >> 1) + 1;
     extra_extrapolation |= src.rows < radiusY;
     extra_extrapolation |= src.cols < (int)((-radiusX + globalsize[0] + 8 * localsize[0] + 3) >> 1) + 1;
     extra_extrapolation |= src.cols < radiusX;
 
-    cv::String build_options = cv::format("-D RADIUSX=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d -D %s -D %s -D %s",
-        radiusX, (int)localsize[0], (int)localsize[1], cn,
-        btype,
-        extra_extrapolation ? "EXTRA_EXTRAPOLATION" : "NO_EXTRA_EXTRAPOLATION",
-        isIsolatedBorder ? "BORDER_ISOLATED" : "NO_BORDER_ISOLATED");
+    char cvt[40];
+    cv::String build_options = cv::format("-D RADIUSX=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d -D %s -D %s -D %s"
+                                          " -D srcT=%s -D dstT=%s -D convertToDstT=%s -D srcT1=%s -D dstT1=%s%s",
+                                          radiusX, (int)localsize[0], (int)localsize[1], cn, btype,
+                                          extra_extrapolation ? "EXTRA_EXTRAPOLATION" : "NO_EXTRA_EXTRAPOLATION",
+                                          isolated ? "BORDER_ISOLATED" : "NO_BORDER_ISOLATED",
+                                          ocl::typeToStr(type), ocl::typeToStr(CV_32FC(cn)),
+                                          ocl::convertTypeStr(sdepth, CV_32F, cn, cvt),
+                                          ocl::typeToStr(sdepth), ocl::typeToStr(CV_32F),
+                                          doubleSupport ? " -D DOUBLE_SUPPORT" : "");
     build_options += ocl::kernelToStr(kernelX, CV_32F);
 
     Size srcWholeSize; Point srcOffset;
     src.locateROI(srcWholeSize, srcOffset);
 
-    std::stringstream strKernel;
-    strKernel << "row_filter";
-    if (-1 != cn)
-        strKernel << "_C" << cn;
-    if (-1 != sdepth)
-        strKernel << "_D" << sdepth;
+    String kernelName("row_filter");
+    if (fast8uc1)
+        kernelName += "_C1_D0";
 
-    ocl::Kernel kernelRow;
-    if (!kernelRow.create(strKernel.str().c_str(), cv::ocl::imgproc::filterSepRow_oclsrc,
-                          build_options))
+    ocl::Kernel k(kernelName.c_str(), cv::ocl::imgproc::filterSepRow_oclsrc,
+                  build_options);
+    if (k.empty())
         return false;
 
-    int idxArg = 0;
-    idxArg = kernelRow.set(idxArg, ocl::KernelArg::PtrReadOnly(src));
-    idxArg = kernelRow.set(idxArg, (int)(src.step / src.elemSize()));
+    if (fast8uc1)
+        k.args(ocl::KernelArg::PtrReadOnly(src), (int)(src.step / src.elemSize()), srcOffset.x,
+               srcOffset.y, src.cols, src.rows, srcWholeSize.width, srcWholeSize.height,
+               ocl::KernelArg::PtrWriteOnly(buf), (int)(buf.step / buf.elemSize()),
+               buf.cols, buf.rows, radiusY);
+    else
+        k.args(ocl::KernelArg::PtrReadOnly(src), (int)src.step, srcOffset.x,
+               srcOffset.y, src.cols, src.rows, srcWholeSize.width, srcWholeSize.height,
+               ocl::KernelArg::PtrWriteOnly(buf), (int)buf.step, buf.cols, buf.rows, radiusY);
 
-    idxArg = kernelRow.set(idxArg, srcOffset.x);
-    idxArg = kernelRow.set(idxArg, srcOffset.y);
-    idxArg = kernelRow.set(idxArg, src.cols);
-    idxArg = kernelRow.set(idxArg, src.rows);
-    idxArg = kernelRow.set(idxArg, srcWholeSize.width);
-    idxArg = kernelRow.set(idxArg, srcWholeSize.height);
-
-    idxArg = kernelRow.set(idxArg, ocl::KernelArg::PtrWriteOnly(buf));
-    idxArg = kernelRow.set(idxArg, (int)(buf.step / buf.elemSize()));
-    idxArg = kernelRow.set(idxArg, buf.cols);
-    idxArg = kernelRow.set(idxArg, buf.rows);
-    idxArg = kernelRow.set(idxArg, radiusY);
-
-    return kernelRow.run(2, globalsize, localsize, sync);
+    return k.run(2, globalsize, localsize, false);
 }
 
-static bool ocl_sepColFilter2D(const UMat &buf, UMat &dst, Mat &kernelY, int anchor, bool sync)
+static bool ocl_sepColFilter2D(const UMat & buf, UMat & dst, const Mat & kernelY, double delta, int anchor)
 {
+    bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
+    if (dst.depth() == CV_64F && !doubleSupport)
+        return false;
+
 #ifdef ANDROID
-    size_t localsize[2] = {16, 10};
+    size_t localsize[2] = { 16, 10 };
 #else
-    size_t localsize[2] = {16, 16};
+    size_t localsize[2] = { 16, 16 };
 #endif
-    size_t globalsize[2] = {0, 0};
+    size_t globalsize[2] = { 0, 0 };
 
     int dtype = dst.type(), cn = CV_MAT_CN(dtype), ddepth = CV_MAT_DEPTH(dtype);
     Size sz = dst.size();
 
     globalsize[1] = DIVUP(sz.height, localsize[1]) * localsize[1];
-
-    if (dtype == CV_8UC2)
-        globalsize[0] = DIVUP((sz.width + 1) / 2, localsize[0]) * localsize[0];
-    else
-        globalsize[0] = DIVUP(sz.width, localsize[0]) * localsize[0];
+    globalsize[0] = DIVUP(sz.width, localsize[0]) * localsize[0];
 
     char cvt[40];
-    cv::String build_options = cv::format("-D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d -D GENTYPE_SRC=%s -D GENTYPE_DST=%s -D convert_to_DST=%s",
-                    anchor, (int)localsize[0], (int)localsize[1], cn, ocl::typeToStr(buf.type()),
-                                          ocl::typeToStr(dtype), ocl::convertTypeStr(CV_32F, ddepth, cn, cvt));
+    cv::String build_options = cv::format("-D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d"
+                                          " -D srcT=%s -D dstT=%s -D convertToDstT=%s"
+                                          " -D srcT1=%s -D dstT1=%s%s",
+                                          anchor, (int)localsize[0], (int)localsize[1], cn,
+                                          ocl::typeToStr(buf.type()), ocl::typeToStr(dtype),
+                                          ocl::convertTypeStr(CV_32F, ddepth, cn, cvt),
+                                          ocl::typeToStr(CV_32F), ocl::typeToStr(ddepth),
+                                          doubleSupport ? " -D DOUBLE_SUPPORT" : "");
     build_options += ocl::kernelToStr(kernelY, CV_32F);
 
-    ocl::Kernel kernelCol;
-    if (!kernelCol.create("col_filter", cv::ocl::imgproc::filterSepCol_oclsrc, build_options))
+    ocl::Kernel k("col_filter", cv::ocl::imgproc::filterSepCol_oclsrc,
+                  build_options);
+    if (k.empty())
         return false;
 
-    int idxArg = 0;
-    idxArg = kernelCol.set(idxArg, ocl::KernelArg::PtrReadOnly(buf));
-    idxArg = kernelCol.set(idxArg, (int)(buf.step / buf.elemSize()));
-    idxArg = kernelCol.set(idxArg, buf.cols);
-    idxArg = kernelCol.set(idxArg, buf.rows);
+    k.args(ocl::KernelArg::ReadOnly(buf), ocl::KernelArg::WriteOnly(dst),
+           static_cast<float>(delta));
 
-    idxArg = kernelCol.set(idxArg, ocl::KernelArg::PtrWriteOnly(dst));
-    idxArg = kernelCol.set(idxArg, (int)(dst.offset / dst.elemSize()));
-    idxArg = kernelCol.set(idxArg, (int)(dst.step / dst.elemSize()));
-    idxArg = kernelCol.set(idxArg, dst.cols);
-    idxArg = kernelCol.set(idxArg, dst.rows);
+    return k.run(2, globalsize, localsize, false);
+}
 
-    return kernelCol.run(2, globalsize, localsize, sync);
+const int optimizedSepFilterLocalSize = 16;
+
+static bool ocl_sepFilter2D_SinglePass(InputArray _src, OutputArray _dst,
+                                       Mat row_kernel, Mat col_kernel,
+                                       double delta, int borderType, int ddepth)
+{
+    Size size = _src.size(), wholeSize;
+    Point origin;
+    int stype = _src.type(), sdepth = CV_MAT_DEPTH(stype), cn = CV_MAT_CN(stype),
+            esz = CV_ELEM_SIZE(stype), wdepth = std::max(std::max(sdepth, ddepth), CV_32F),
+            dtype = CV_MAKE_TYPE(ddepth, cn);
+    size_t src_step = _src.step(), src_offset = _src.offset();
+    bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
+
+    if ((src_offset % src_step) % esz != 0 || (!doubleSupport && (sdepth == CV_64F || ddepth == CV_64F)) ||
+            !(borderType == BORDER_CONSTANT || borderType == BORDER_REPLICATE ||
+              borderType == BORDER_REFLECT || borderType == BORDER_WRAP ||
+              borderType == BORDER_REFLECT_101))
+        return false;
+
+    size_t lt2[2] = { optimizedSepFilterLocalSize, optimizedSepFilterLocalSize };
+    size_t gt2[2] = { lt2[0] * (1 + (size.width - 1) / lt2[0]), lt2[1] * (1 + (size.height - 1) / lt2[1]) };
+
+    char cvt[2][40];
+    const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", "BORDER_WRAP",
+                                       "BORDER_REFLECT_101" };
+
+    String opts = cv::format("-D BLK_X=%d -D BLK_Y=%d -D RADIUSX=%d -D RADIUSY=%d%s%s"
+                             " -D srcT=%s -D convertToWT=%s -D WT=%s -D dstT=%s -D convertToDstT=%s"
+                             " -D %s -D srcT1=%s -D dstT1=%s -D CN=%d", (int)lt2[0], (int)lt2[1],
+                             row_kernel.cols / 2, col_kernel.cols / 2,
+                             ocl::kernelToStr(row_kernel, CV_32F, "KERNEL_MATRIX_X").c_str(),
+                             ocl::kernelToStr(col_kernel, CV_32F, "KERNEL_MATRIX_Y").c_str(),
+                             ocl::typeToStr(stype), ocl::convertTypeStr(sdepth, wdepth, cn, cvt[0]),
+                             ocl::typeToStr(CV_MAKE_TYPE(wdepth, cn)), ocl::typeToStr(dtype),
+                             ocl::convertTypeStr(wdepth, ddepth, cn, cvt[1]), borderMap[borderType],
+                             ocl::typeToStr(sdepth), ocl::typeToStr(ddepth), cn);
+
+    ocl::Kernel k("sep_filter", ocl::imgproc::filterSep_singlePass_oclsrc, opts);
+    if (k.empty())
+        return false;
+
+    UMat src = _src.getUMat();
+    _dst.create(size, dtype);
+    UMat dst = _dst.getUMat();
+
+    int src_offset_x = static_cast<int>((src_offset % src_step) / esz);
+    int src_offset_y = static_cast<int>(src_offset / src_step);
+
+    src.locateROI(wholeSize, origin);
+
+    k.args(ocl::KernelArg::PtrReadOnly(src), (int)src_step, src_offset_x, src_offset_y,
+           wholeSize.height, wholeSize.width, ocl::KernelArg::WriteOnly(dst),
+           static_cast<float>(delta));
+
+    return k.run(2, gt2, lt2, false);
 }
 
 static bool ocl_sepFilter2D( InputArray _src, OutputArray _dst, int ddepth,
                       InputArray _kernelX, InputArray _kernelY, Point anchor,
                       double delta, int borderType )
 {
-    if (abs(delta)> FLT_MIN)
-        return false;
+    const ocl::Device & d = ocl::Device::getDefault();
+    Size imgSize = _src.size();
 
-    int type = _src.type();
-    if ( !( (type == CV_8UC1 || type == CV_8UC4 || type == CV_32FC1 || type == CV_32FC4) &&
-            (ddepth == CV_32F || ddepth == CV_16S || ddepth == CV_8U || ddepth < 0) ) )
+    int type = _src.type(), sdepth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
+    if (cn > 4)
         return false;
 
-    int cn = CV_MAT_CN(type);
-
     Mat kernelX = _kernelX.getMat().reshape(1, 1);
-    if (1 != (kernelX.cols % 2))
+    if (kernelX.cols % 2 != 1)
         return false;
     Mat kernelY = _kernelY.getMat().reshape(1, 1);
-    if (1 != (kernelY.cols % 2))
+    if (kernelY.cols % 2 != 1)
         return false;
 
-    int sdepth = CV_MAT_DEPTH(type);
-    if( anchor.x < 0 )
-        anchor.x = kernelX.cols >> 1;
-    if( anchor.y < 0 )
-        anchor.y = kernelY.cols >> 1;
-
-    if( ddepth < 0 )
+    if (ddepth < 0)
         ddepth = sdepth;
 
+    CV_OCL_RUN_(kernelY.cols <= 21 && kernelX.cols <= 21 &&
+                imgSize.width > optimizedSepFilterLocalSize + (kernelX.cols >> 1) &&
+                imgSize.height > optimizedSepFilterLocalSize + (kernelY.cols >> 1) &&
+                (!(borderType & BORDER_ISOLATED) || _src.offset() == 0) && anchor == Point(-1, -1) &&
+                (d.isIntel() || (d.isAMD() && !d.hostUnifiedMemory())),
+                ocl_sepFilter2D_SinglePass(_src, _dst, kernelX, kernelY, delta,
+                                           borderType & ~BORDER_ISOLATED, ddepth), true)
+
+    if (anchor.x < 0)
+        anchor.x = kernelX.cols >> 1;
+    if (anchor.y < 0)
+        anchor.y = kernelY.cols >> 1;
+
     UMat src = _src.getUMat();
     Size srcWholeSize; Point srcOffset;
     src.locateROI(srcWholeSize, srcOffset);
-    if ( (0 != (srcOffset.x % 4))   ||
-         (0 != (src.cols % 4))      ||
-         (0 != ((src.step / src.elemSize()) % 4))
-       )
-        return false;
+
+    bool fast8uc1 = type == CV_8UC1 && srcOffset.x % 4 == 0 &&
+            src.cols % 4 == 0 && src.step % 4 == 0;
 
     Size srcSize = src.size();
     Size bufSize(srcSize.width, srcSize.height + kernelY.cols - 1);
-    UMat buf; buf.create(bufSize, CV_MAKETYPE(CV_32F, cn));
-    if (!ocl_sepRowFilter2D(src, buf, kernelX, anchor.x, borderType, false))
+    UMat buf(bufSize, CV_32FC(cn));
+    if (!ocl_sepRowFilter2D(src, buf, kernelX, anchor.x, borderType, ddepth, fast8uc1))
         return false;
 
     _dst.create(srcSize, CV_MAKETYPE(ddepth, cn));
     UMat dst = _dst.getUMat();
-    return ocl_sepColFilter2D(buf, dst, kernelY, anchor.y, false);
+
+    return ocl_sepColFilter2D(buf, dst, kernelY, delta, anchor.y);
 }
 
 #endif
diff --git a/modules/imgproc/src/imgwarp.cpp b/modules/imgproc/src/imgwarp.cpp
index 51ee5bc0d..da392b055 100644
--- a/modules/imgproc/src/imgwarp.cpp
+++ b/modules/imgproc/src/imgwarp.cpp
@@ -1917,71 +1917,73 @@ class IPPresizeInvoker :
     public ParallelLoopBody
 {
 public:
-    IPPresizeInvoker(Mat &_src, Mat &_dst, double _inv_scale_x, double _inv_scale_y, int _mode, bool *_ok) :
-      ParallelLoopBody(), src(_src), dst(_dst), inv_scale_x(_inv_scale_x), inv_scale_y(_inv_scale_y), mode(_mode), ok(_ok)
-      {
-          *ok = true;
-          IppiSize srcSize, dstSize;
-          int type = src.type();
-          int specSize = 0, initSize = 0;
-          srcSize.width  = src.cols;
-          srcSize.height = src.rows;
-          dstSize.width  = dst.cols;
-          dstSize.height = dst.rows;
+    IPPresizeInvoker(const Mat & _src, Mat & _dst, double _inv_scale_x, double _inv_scale_y, int _mode, bool *_ok) :
+        ParallelLoopBody(), src(_src), dst(_dst), inv_scale_x(_inv_scale_x), inv_scale_y(_inv_scale_y), mode(_mode), ok(_ok)
+    {
+        *ok = true;
+        IppiSize srcSize, dstSize;
+        int type = src.type();
+        int specSize = 0, initSize = 0;
+        srcSize.width  = src.cols;
+        srcSize.height = src.rows;
+        dstSize.width  = dst.cols;
+        dstSize.height = dst.rows;
 
-          switch (type)
-          {
-          case CV_8UC1:  SET_IPP_RESIZE_PTR(8u,C1);  break;
-          case CV_8UC3:  SET_IPP_RESIZE_PTR(8u,C3);  break;
-          case CV_8UC4:  SET_IPP_RESIZE_PTR(8u,C4);  break;
-          case CV_16UC1: SET_IPP_RESIZE_PTR(16u,C1); break;
-          case CV_16UC3: SET_IPP_RESIZE_PTR(16u,C3); break;
-          case CV_16UC4: SET_IPP_RESIZE_PTR(16u,C4); break;
-          case CV_16SC1: SET_IPP_RESIZE_PTR(16s,C1); break;
-          case CV_16SC3: SET_IPP_RESIZE_PTR(16s,C3); break;
-          case CV_16SC4: SET_IPP_RESIZE_PTR(16s,C4); break;
-          case CV_32FC1: SET_IPP_RESIZE_PTR(32f,C1); break;
-          case CV_32FC3: SET_IPP_RESIZE_PTR(32f,C3); break;
-          case CV_32FC4: SET_IPP_RESIZE_PTR(32f,C4); break;
-          case CV_64FC1: SET_IPP_RESIZE_LINEAR_FUNC_64_PTR(64f,C1); break;
-          case CV_64FC3: SET_IPP_RESIZE_LINEAR_FUNC_64_PTR(64f,C3); break;
-          case CV_64FC4: SET_IPP_RESIZE_LINEAR_FUNC_64_PTR(64f,C4); break;
-          default: { *ok = false; return;} break;
-          }
-      }
+        switch (type)
+        {
+            case CV_8UC1:  SET_IPP_RESIZE_PTR(8u,C1);  break;
+            case CV_8UC3:  SET_IPP_RESIZE_PTR(8u,C3);  break;
+            case CV_8UC4:  SET_IPP_RESIZE_PTR(8u,C4);  break;
+            case CV_16UC1: SET_IPP_RESIZE_PTR(16u,C1); break;
+            case CV_16UC3: SET_IPP_RESIZE_PTR(16u,C3); break;
+            case CV_16UC4: SET_IPP_RESIZE_PTR(16u,C4); break;
+            case CV_16SC1: SET_IPP_RESIZE_PTR(16s,C1); break;
+            case CV_16SC3: SET_IPP_RESIZE_PTR(16s,C3); break;
+            case CV_16SC4: SET_IPP_RESIZE_PTR(16s,C4); break;
+            case CV_32FC1: SET_IPP_RESIZE_PTR(32f,C1); break;
+            case CV_32FC3: SET_IPP_RESIZE_PTR(32f,C3); break;
+            case CV_32FC4: SET_IPP_RESIZE_PTR(32f,C4); break;
+            case CV_64FC1: SET_IPP_RESIZE_LINEAR_FUNC_64_PTR(64f,C1); break;
+            case CV_64FC3: SET_IPP_RESIZE_LINEAR_FUNC_64_PTR(64f,C3); break;
+            case CV_64FC4: SET_IPP_RESIZE_LINEAR_FUNC_64_PTR(64f,C4); break;
+            default: { *ok = false; return; } break;
+        }
+    }
 
-      ~IPPresizeInvoker()
-      {
-      }
+    ~IPPresizeInvoker()
+    {
+    }
 
-      virtual void operator() (const Range& range) const
-      {
-          if (*ok == false) return;
+    virtual void operator() (const Range& range) const
+    {
+        if (*ok == false)
+          return;
 
-          int cn = src.channels();
-          int dsty = min(cvRound(range.start * inv_scale_y), dst.rows);
-          int dstwidth  = min(cvRound(src.cols * inv_scale_x), dst.cols);
-          int dstheight = min(cvRound(range.end * inv_scale_y), dst.rows);
+        int cn = src.channels();
+        int dsty = min(cvRound(range.start * inv_scale_y), dst.rows);
+        int dstwidth  = min(cvRound(src.cols * inv_scale_x), dst.cols);
+        int dstheight = min(cvRound(range.end * inv_scale_y), dst.rows);
 
-          IppiPoint dstOffset = { 0, dsty }, srcOffset = {0, 0};
-          IppiSize  dstSize   = { dstwidth, dstheight - dsty };
-          int bufsize = 0, itemSize = (int)src.elemSize1();
+        IppiPoint dstOffset = { 0, dsty }, srcOffset = {0, 0};
+        IppiSize  dstSize   = { dstwidth, dstheight - dsty };
+        int bufsize = 0, itemSize = (int)src.elemSize1();
 
-          CHECK_IPP_STATUS(getBufferSizeFunc(pSpec, dstSize, cn, &bufsize));
-          CHECK_IPP_STATUS(getSrcOffsetFunc(pSpec, dstOffset, &srcOffset));
+        CHECK_IPP_STATUS(getBufferSizeFunc(pSpec, dstSize, cn, &bufsize));
+        CHECK_IPP_STATUS(getSrcOffsetFunc(pSpec, dstOffset, &srcOffset));
 
-          Ipp8u* pSrc = (Ipp8u*)src.data + (int)src.step[0] * srcOffset.y + srcOffset.x * cn * itemSize;
-          Ipp8u* pDst = (Ipp8u*)dst.data + (int)dst.step[0] * dstOffset.y + dstOffset.x * cn * itemSize;
+        Ipp8u* pSrc = (Ipp8u*)src.data + (int)src.step[0] * srcOffset.y + srcOffset.x * cn * itemSize;
+        Ipp8u* pDst = (Ipp8u*)dst.data + (int)dst.step[0] * dstOffset.y + dstOffset.x * cn * itemSize;
 
-          AutoBuffer<uchar> buf(bufsize + 64);
-          uchar* bufptr = alignPtr((uchar*)buf, 32);
+        AutoBuffer<uchar> buf(bufsize + 64);
+        uchar* bufptr = alignPtr((uchar*)buf, 32);
+
+        if( func( pSrc, (int)src.step[0], pDst, (int)dst.step[0], dstOffset, dstSize, ippBorderRepl, 0, pSpec, bufptr ) < 0 )
+            *ok = false;
+    }
 
-          if( func( pSrc, (int)src.step[0], pDst, (int)dst.step[0], dstOffset, dstSize, ippBorderRepl, 0, pSpec, bufptr ) < 0 )
-              *ok = false;
-      }
 private:
-    Mat &src;
-    Mat &dst;
+    const Mat & src; // read-only source image; must be const to bind to the constructor's const Mat & argument
+    Mat & dst;
     double inv_scale_x;
     double inv_scale_y;
     void *pSpec;
@@ -1993,12 +1995,13 @@ private:
     bool *ok;
     const IPPresizeInvoker& operator= (const IPPresizeInvoker&);
 };
+
 #endif
 
 #ifdef HAVE_OPENCL
 
 static void ocl_computeResizeAreaTabs(int ssize, int dsize, double scale, int * const map_tab,
-                                          float * const alpha_tab, int * const ofs_tab)
+                                      float * const alpha_tab, int * const ofs_tab)
 {
     int k = 0, dx = 0;
     for ( ; dx < dsize; dx++)
@@ -2049,8 +2052,16 @@ static bool ocl_resize( InputArray _src, OutputArray _dst, Size dsize,
 {
     int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
 
-    double inv_fx = 1. / fx, inv_fy = 1. / fy;
+    double inv_fx = 1.0 / fx, inv_fy = 1.0 / fy;
     float inv_fxf = (float)inv_fx, inv_fyf = (float)inv_fy;
+    int iscale_x = saturate_cast<int>(inv_fx), iscale_y = saturate_cast<int>(inv_fy);
+    bool is_area_fast = std::abs(inv_fx - iscale_x) < DBL_EPSILON &&
+        std::abs(inv_fy - iscale_y) < DBL_EPSILON; // both inverse scales are within DBL_EPSILON of an integer
+
+    // when both iscale_x and iscale_y are equal to 2,
+    // INTER_AREA (fast) produces the same result as INTER_LINEAR
+    if( interpolation == INTER_LINEAR && is_area_fast && iscale_x == 2 && iscale_y == 2 )
+        /*interpolation = INTER_AREA*/(void)0; // INTER_AREA is slower
 
     if( !(cn <= 4 &&
            (interpolation == INTER_NEAREST || interpolation == INTER_LINEAR ||
@@ -2061,39 +2072,105 @@ static bool ocl_resize( InputArray _src, OutputArray _dst, Size dsize,
     _dst.create(dsize, type);
     UMat dst = _dst.getUMat();
 
+    Size ssize = src.size();
     ocl::Kernel k;
     size_t globalsize[] = { dst.cols, dst.rows };
 
     if (interpolation == INTER_LINEAR)
     {
-        int wdepth = std::max(depth, CV_32S);
-        int wtype = CV_MAKETYPE(wdepth, cn);
         char buf[2][32];
-        k.create("resizeLN", ocl::imgproc::resize_oclsrc,
-                 format("-D INTER_LINEAR -D depth=%d -D PIXTYPE=%s -D PIXTYPE1=%s "
-                        "-D WORKTYPE=%s -D convertToWT=%s -D convertToDT=%s -D cn=%d",
-                        depth, ocl::typeToStr(type), ocl::typeToStr(depth), ocl::typeToStr(wtype),
-                        ocl::convertTypeStr(depth, wdepth, cn, buf[0]),
-                        ocl::convertTypeStr(wdepth, depth, cn, buf[1]),
-                        cn));
+
+        // integer path is slower because of CPU part, so it's disabled
+        if (depth == CV_8U && ((void)0, 0))
+        {
+            AutoBuffer<uchar> _buffer((dsize.width + dsize.height)*(sizeof(int) + sizeof(short)*2));
+            int* xofs = (int*)(uchar*)_buffer, * yofs = xofs + dsize.width;
+            short* ialpha = (short*)(yofs + dsize.height), * ibeta = ialpha + dsize.width*2;
+            float fxx, fyy;
+            int sx, sy;
+
+            for (int dx = 0; dx < dsize.width; dx++)
+            {
+                fxx = (float)((dx+0.5)*inv_fx - 0.5);
+                sx = cvFloor(fxx);
+                fxx -= sx;
+
+                if (sx < 0)
+                    fxx = 0, sx = 0;
+
+                if (sx >= ssize.width-1)
+                    fxx = 0, sx = ssize.width-1;
+
+                xofs[dx] = sx;
+                ialpha[dx*2 + 0] = saturate_cast<short>((1.f - fxx) * INTER_RESIZE_COEF_SCALE);
+                ialpha[dx*2 + 1] = saturate_cast<short>(fxx         * INTER_RESIZE_COEF_SCALE);
+            }
+
+            for (int dy = 0; dy < dsize.height; dy++)
+            {
+                fyy = (float)((dy+0.5)*inv_fy - 0.5);
+                sy = cvFloor(fyy);
+                fyy -= sy;
+
+                yofs[dy] = sy;
+                ibeta[dy*2 + 0] = saturate_cast<short>((1.f - fyy) * INTER_RESIZE_COEF_SCALE);
+                ibeta[dy*2 + 1] = saturate_cast<short>(fyy         * INTER_RESIZE_COEF_SCALE);
+            }
+
+            int wdepth = std::max(depth, CV_32S), wtype = CV_MAKETYPE(wdepth, cn);
+            UMat coeffs;
+            Mat(1, static_cast<int>(_buffer.size()), CV_8UC1, (uchar *)_buffer).copyTo(coeffs);
+
+            k.create("resizeLN", ocl::imgproc::resize_oclsrc,
+                     format("-D INTER_LINEAR_INTEGER -D depth=%d -D T=%s -D T1=%s "
+                            "-D WT=%s -D convertToWT=%s -D convertToDT=%s -D cn=%d "
+                            "-D INTER_RESIZE_COEF_BITS=%d",
+                            depth, ocl::typeToStr(type), ocl::typeToStr(depth), ocl::typeToStr(wtype),
+                            ocl::convertTypeStr(depth, wdepth, cn, buf[0]),
+                            ocl::convertTypeStr(wdepth, depth, cn, buf[1]),
+                            cn, INTER_RESIZE_COEF_BITS));
+            if (k.empty())
+                return false;
+
+            k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst),
+                   ocl::KernelArg::PtrReadOnly(coeffs));
+        }
+        else
+        {
+            int wdepth = std::max(depth, CV_32S), wtype = CV_MAKETYPE(wdepth, cn);
+            k.create("resizeLN", ocl::imgproc::resize_oclsrc,
+                     format("-D INTER_LINEAR -D depth=%d -D T=%s -D T1=%s "
+                            "-D WT=%s -D convertToWT=%s -D convertToDT=%s -D cn=%d "
+                            "-D INTER_RESIZE_COEF_BITS=%d",
+                            depth, ocl::typeToStr(type), ocl::typeToStr(depth), ocl::typeToStr(wtype),
+                            ocl::convertTypeStr(depth, wdepth, cn, buf[0]),
+                            ocl::convertTypeStr(wdepth, depth, cn, buf[1]),
+                            cn, INTER_RESIZE_COEF_BITS));
+            if (k.empty())
+                return false;
+
+            k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst),
+                   (float)inv_fx, (float)inv_fy);
+        }
     }
     else if (interpolation == INTER_NEAREST)
     {
         k.create("resizeNN", ocl::imgproc::resize_oclsrc,
-                 format("-D INTER_NEAREST -D PIXTYPE=%s -D PIXTYPE1=%s -D cn=%d",
+                 format("-D INTER_NEAREST -D T=%s -D T1=%s -D cn=%d",
                         ocl::memopTypeToStr(type), ocl::memopTypeToStr(depth), cn));
+        if (k.empty())
+            return false;
+
+        k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst),
+               (float)inv_fx, (float)inv_fy);
     }
     else if (interpolation == INTER_AREA)
     {
-        int iscale_x = saturate_cast<int>(inv_fx);
-        int iscale_y = saturate_cast<int>(inv_fy);
-        bool is_area_fast = std::abs(inv_fx - iscale_x) < DBL_EPSILON &&
-                        std::abs(inv_fy - iscale_y) < DBL_EPSILON;
         int wdepth = std::max(depth, is_area_fast ? CV_32S : CV_32F);
         int wtype = CV_MAKE_TYPE(wdepth, cn);
 
         char cvt[2][40];
-        String buildOption = format("-D INTER_AREA -D PIXTYPE=%s -D PIXTYPE1=%s -D WTV=%s -D convertToWTV=%s -D cn=%d",
+        String buildOption = format("-D INTER_AREA -D T=%s -D T1=%s -D WTV=%s -D convertToWTV=%s -D cn=%d",
                                     ocl::typeToStr(type), ocl::typeToStr(depth), ocl::typeToStr(wtype),
                                     ocl::convertTypeStr(depth, wdepth, cn, cvt[0]), cn);
 
@@ -2103,7 +2180,7 @@ static bool ocl_resize( InputArray _src, OutputArray _dst, Size dsize,
         if (is_area_fast)
         {
             int wdepth2 = std::max(CV_32F, depth), wtype2 = CV_MAKE_TYPE(wdepth2, cn);
-            buildOption = buildOption + format(" -D convertToPIXTYPE=%s -D WT2V=%s -D convertToWT2V=%s -D INTER_AREA_FAST"
+            buildOption = buildOption + format(" -D convertToT=%s -D WT2V=%s -D convertToWT2V=%s -D INTER_AREA_FAST"
                                                " -D XSCALE=%d -D YSCALE=%d -D SCALE=%ff",
                                                ocl::convertTypeStr(wdepth2, depth, cn, cvt[0]),
                                                ocl::typeToStr(wtype2), ocl::convertTypeStr(wdepth, wdepth2, cn, cvt[1]),
@@ -2126,12 +2203,11 @@ static bool ocl_resize( InputArray _src, OutputArray _dst, Size dsize,
         }
         else
         {
-            buildOption = buildOption + format(" -D convertToPIXTYPE=%s", ocl::convertTypeStr(wdepth, depth, cn, cvt[0]));
+            buildOption = buildOption + format(" -D convertToT=%s", ocl::convertTypeStr(wdepth, depth, cn, cvt[0]));
             k.create("resizeAREA", ocl::imgproc::resize_oclsrc, buildOption);
             if (k.empty())
                 return false;
 
-            Size ssize = src.size();
             int xytab_size = (ssize.width + ssize.height) << 1;
             int tabofs_size = dsize.height + dsize.width + 2;
 
@@ -2161,11 +2237,6 @@ static bool ocl_resize( InputArray _src, OutputArray _dst, Size dsize,
         return k.run(2, globalsize, NULL, false);
     }
 
-    if( k.empty() )
-        return false;
-    k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst),
-           (float)inv_fx, (float)inv_fy);
-
     return k.run(2, globalsize, 0, false);
 }
 
diff --git a/modules/imgproc/src/morph.cpp b/modules/imgproc/src/morph.cpp
index ac958fc69..1dd0a252e 100644
--- a/modules/imgproc/src/morph.cpp
+++ b/modules/imgproc/src/morph.cpp
@@ -42,7 +42,6 @@
 
 #include "precomp.hpp"
 #include <limits.h>
-#include <stdio.h>
 #include "opencl_kernels.hpp"
 
 /****************************************************************************************\
@@ -1291,9 +1290,10 @@ static bool ocl_morphology_op(InputArray _src, OutputArray _dst, Mat kernel,
 {
     CV_Assert(op == MORPH_ERODE || op == MORPH_DILATE);
 
+    int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
     bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
 
-    if (_src.depth() == CV_64F && !doubleSupport)
+    if (depth == CV_64F && !doubleSupport)
         return false;
 
     UMat kernel8U;
@@ -1324,13 +1324,14 @@ static bool ocl_morphology_op(InputArray _src, OutputArray _dst, Mat kernel,
         return false;
 
     static const char * const op2str[] = { "ERODE", "DILATE" };
-    String buildOptions = format("-D RADIUSX=%d -D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D %s%s%s -D GENTYPE=%s -D DEPTH_%d",
-                                 anchor.x, anchor.y, (int)localThreads[0], (int)localThreads[1], op2str[op],
+    String buildOptions = format("-D RADIUSX=%d -D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D %s%s%s"
+                                 " -D T=%s -D DEPTH_%d -D cn=%d -D T1=%s", anchor.x, anchor.y,
+                                 (int)localThreads[0], (int)localThreads[1], op2str[op],
                                  doubleSupport ? " -D DOUBLE_SUPPORT" : "", rectKernel ? " -D RECTKERNEL" : "",
-                                 ocl::typeToStr(_src.type()), _src.depth() );
+                                 ocl::typeToStr(type), depth, cn, ocl::typeToStr(depth)); // reuse type/depth/cn computed at function entry
 
     std::vector<ocl::Kernel> kernels;
-    for (int i = 0; i<iterations; i++)
+    for (int i = 0; i < iterations; i++)
     {
         ocl::Kernel k("morph", ocl::imgproc::morph_oclsrc, buildOptions);
         if (k.empty())
@@ -1341,38 +1342,35 @@ static bool ocl_morphology_op(InputArray _src, OutputArray _dst, Mat kernel,
     _dst.create(src.size(), src.type());
     UMat dst = _dst.getUMat();
 
-    if( iterations== 1 && src.u != dst.u)
+    if (iterations == 1 && src.u != dst.u)
     {
         Size wholesize;
         Point ofs;
         src.locateROI(wholesize, ofs);
         int wholecols = wholesize.width, wholerows = wholesize.height;
 
-        int idxArg = 0;
-        idxArg = kernels[0].set(idxArg, ocl::KernelArg::ReadOnlyNoSize(src));
-        idxArg = kernels[0].set(idxArg, ocl::KernelArg::WriteOnlyNoSize(dst));
-        idxArg = kernels[0].set(idxArg, ofs.x);
-        idxArg = kernels[0].set(idxArg, ofs.y);
-        idxArg = kernels[0].set(idxArg, src.cols);
-        idxArg = kernels[0].set(idxArg, src.rows);
-        idxArg = kernels[0].set(idxArg, ocl::KernelArg::PtrReadOnly(kernel8U));
-        idxArg = kernels[0].set(idxArg, wholecols);
-        idxArg = kernels[0].set(idxArg, wholerows);
+        kernels[0].args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::WriteOnlyNoSize(dst),
+                        ofs.x, ofs.y, src.cols, src.rows, ocl::KernelArg::PtrReadOnly(kernel8U),
+                        wholecols, wholerows);
 
         return kernels[0].run(2, globalThreads, localThreads, false);
     }
 
-    for(int i = 0; i< iterations; i++)
+    for (int i = 0; i < iterations; i++)
     {
         UMat source;
         Size wholesize;
         Point ofs;
-        if( i == 0)
+
+        if (i == 0)
         {
             int cols =  src.cols, rows = src.rows;
             src.locateROI(wholesize,ofs);
             src.adjustROI(ofs.y, wholesize.height - rows - ofs.y, ofs.x, wholesize.width - cols - ofs.x);
-            src.copyTo(source);
+            if(src.u != dst.u)
+                source = src;
+            else
+                src.copyTo(source);
             src.adjustROI(-ofs.y, -wholesize.height + rows + ofs.y, -ofs.x, -wholesize.width + cols + ofs.x);
             source.adjustROI(-ofs.y, -wholesize.height + rows + ofs.y, -ofs.x, -wholesize.width + cols + ofs.x);
         }
@@ -1385,20 +1383,11 @@ static bool ocl_morphology_op(InputArray _src, OutputArray _dst, Mat kernel,
             dst.adjustROI(-ofs.y, -wholesize.height + rows + ofs.y, -ofs.x, -wholesize.width + cols + ofs.x);
             source.adjustROI(-ofs.y, -wholesize.height + rows + ofs.y, -ofs.x, -wholesize.width + cols + ofs.x);
         }
-
         source.locateROI(wholesize, ofs);
-        int wholecols = wholesize.width, wholerows = wholesize.height;
 
-        int idxArg = 0;
-        idxArg = kernels[i].set(idxArg, ocl::KernelArg::ReadOnlyNoSize(source));
-        idxArg = kernels[i].set(idxArg, ocl::KernelArg::WriteOnlyNoSize(dst));
-        idxArg = kernels[i].set(idxArg, ofs.x);
-        idxArg = kernels[i].set(idxArg, ofs.y);
-        idxArg = kernels[i].set(idxArg, source.cols);
-        idxArg = kernels[i].set(idxArg, source.rows);
-        idxArg = kernels[i].set(idxArg, ocl::KernelArg::PtrReadOnly(kernel8U));
-        idxArg = kernels[i].set(idxArg, wholecols);
-        idxArg = kernels[i].set(idxArg, wholerows);
+        kernels[i].args(ocl::KernelArg::ReadOnlyNoSize(source), ocl::KernelArg::WriteOnlyNoSize(dst),
+                        ofs.x, ofs.y, source.cols, source.rows, ocl::KernelArg::PtrReadOnly(kernel8U),
+                        wholesize.width, wholesize.height);
 
         if (!kernels[i].run(2, globalThreads, localThreads, false))
             return false;
@@ -1414,7 +1403,7 @@ static void morphOp( int op, InputArray _src, OutputArray _dst,
                      int borderType, const Scalar& borderValue )
 {
 #ifdef HAVE_OPENCL
-    int src_type = _src.type(), dst_type = _dst.type(),
+    int src_type = _src.type(),
         src_cn = CV_MAT_CN(src_type), src_depth = CV_MAT_DEPTH(src_type);
 #endif
 
@@ -1427,13 +1416,13 @@ static void morphOp( int op, InputArray _src, OutputArray _dst,
         return;
 #endif
 
-    if( iterations == 0 || kernel.rows*kernel.cols == 1 )
+    if (iterations == 0 || kernel.rows*kernel.cols == 1)
     {
         _src.copyTo(_dst);
         return;
     }
 
-    if( !kernel.data )
+    if (!kernel.data)
     {
         kernel = getStructuringElement(MORPH_RECT, Size(1+iterations*2,1+iterations*2));
         anchor = Point(iterations, iterations);
@@ -1449,8 +1438,7 @@ static void morphOp( int op, InputArray _src, OutputArray _dst,
         iterations = 1;
     }
 
-    CV_OCL_RUN(_dst.isUMat() && _src.size() == _dst.size() && src_type == dst_type &&
-               _src.dims() <= 2 && (src_cn == 1 || src_cn == 4) &&
+    CV_OCL_RUN(_dst.isUMat() && _src.dims() <= 2 && src_cn <= 4 &&
                (src_depth == CV_8U || src_depth == CV_32F || src_depth == CV_64F ) &&
                borderType == cv::BORDER_CONSTANT && borderValue == morphologyDefaultBorderValue() &&
                (op == MORPH_ERODE || op == MORPH_DILATE),
diff --git a/modules/imgproc/src/opencl/bilateral.cl b/modules/imgproc/src/opencl/bilateral.cl
index f459cfc85..013be8015 100644
--- a/modules/imgproc/src/opencl/bilateral.cl
+++ b/modules/imgproc/src/opencl/bilateral.cl
@@ -32,6 +32,28 @@
 // or tort (including negligence or otherwise) arising in any way out of
 // the use of this software, even if advised of the possibility of such damage.
 
+#if cn != 3
+#define loadpix(addr) *(__global const uchar_t *)(addr)
+#define storepix(val, addr)  *(__global uchar_t *)(addr) = val
+#define TSIZE cn
+#else
+#define loadpix(addr) vload3(0, (__global const uchar *)(addr))
+#define storepix(val, addr) vstore3(val, 0, (__global uchar *)(addr))
+#define TSIZE 3
+#endif
+
+#if cn == 1 // SUM(a): sum of the cn components of a; fully parenthesised so it is safe inside larger expressions
+#define SUM(a) (a)
+#elif cn == 2
+#define SUM(a) ((a).x + (a).y)
+#elif cn == 3
+#define SUM(a) ((a).x + (a).y + (a).z)
+#elif cn == 4
+#define SUM(a) ((a).x + (a).y + (a).z + (a).w)
+#else
+#error "cn should be <= 4"
+#endif
+
 __kernel void bilateral(__global const uchar * src, int src_step, int src_offset,
                         __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols,
                         __constant float * color_weight, __constant float * space_weight, __constant int * space_ofs)
@@ -41,19 +63,23 @@ __kernel void bilateral(__global const uchar * src, int src_step, int src_offset
 
     if (y < dst_rows && x < dst_cols)
     {
-        int src_index = mad24(y + radius, src_step, x + radius + src_offset);
-        int dst_index = mad24(y, dst_step, x + dst_offset);
-        float sum = 0.f, wsum = 0.f;
-        int val0 = convert_int(src[src_index]);
+        int src_index = mad24(y + radius, src_step, mad24(x + radius, TSIZE, src_offset));
+        int dst_index = mad24(y, dst_step, mad24(x, TSIZE, dst_offset));
+
+        float_t sum = (float_t)(0.0f);
+        float wsum = 0.0f;
+        int_t val0 = convert_int_t(loadpix(src + src_index));
 
         #pragma unroll
         for (int k = 0; k < maxk; k++ )
         {
-            int val = convert_int(src[src_index + space_ofs[k]]);
-            float w = space_weight[k] * color_weight[abs(val - val0)];
-            sum += (float)(val) * w;
+            int_t val = convert_int_t(loadpix(src + src_index + space_ofs[k]));
+            uint_t diff = abs(val - val0);
+            float w = space_weight[k] * color_weight[SUM(diff)];
+            sum += convert_float_t(val) * (float_t)(w);
             wsum += w;
         }
-        dst[dst_index] = convert_uchar_rtz(sum / wsum + 0.5f);
+
+        storepix(convert_uchar_t(sum / (float_t)(wsum)), dst + dst_index);
     }
 }
diff --git a/modules/imgproc/src/opencl/boxFilter.cl b/modules/imgproc/src/opencl/boxFilter.cl
index 986fc785c..00eec4600 100644
--- a/modules/imgproc/src/opencl/boxFilter.cl
+++ b/modules/imgproc/src/opencl/boxFilter.cl
@@ -47,6 +47,18 @@
 #endif
 #endif
 
+#if cn != 3 // pixels are read/written as whole ST/DT values
+#define loadpix(addr) *(__global const ST *)(addr)
+#define storepix(val, addr)  *(__global DT *)(addr) = val
+#define SRCSIZE ((int)sizeof(ST))
+#define DSTSIZE ((int)sizeof(DT))
+#else // cn == 3: use vload3/vstore3 on the element types ST1/DT1; pixel size is element size * cn
+#define loadpix(addr) vload3(0, (__global const ST1 *)(addr))
+#define storepix(val, addr) vstore3(val, 0, (__global DT1 *)(addr))
+#define SRCSIZE ((int)sizeof(ST1)*cn)
+#define DSTSIZE ((int)sizeof(DT1)*cn)
+#endif
+
 #ifdef BORDER_CONSTANT
 #elif defined BORDER_REPLICATE
 #define EXTRAPOLATE(x, y, minX, minY, maxX, maxY) \
@@ -123,8 +135,8 @@ inline WT readSrcPixel(int2 pos, __global const uchar * srcptr, int src_step, co
     if (pos.x >= 0 && pos.y >= 0 && pos.x < srcCoords.x2 && pos.y < srcCoords.y2)
 #endif
     {
-        int src_index = mad24(pos.y, src_step, pos.x * (int)sizeof(ST));
-        WT value = convertToWT(*(__global const ST *)(srcptr + src_index));
+        int src_index = mad24(pos.y, src_step, pos.x * SRCSIZE);
+        WT value = convertToWT(loadpix(srcptr + src_index));
 
         return PROCESS_ELEM(value);
     }
@@ -143,8 +155,8 @@ inline WT readSrcPixel(int2 pos, __global const uchar * srcptr, int src_step, co
 #endif
             srcCoords.x2, srcCoords.y2);
 
-        int src_index = mad24(selected_row, src_step, selected_col * (int)sizeof(ST));
-        WT value = convertToWT(*(__global const ST *)(srcptr + src_index));
+        int src_index = mad24(selected_row, src_step, selected_col * SRCSIZE);
+        WT value = convertToWT(loadpix(srcptr + src_index));
 
         return PROCESS_ELEM(value);
 #endif
@@ -180,7 +192,7 @@ __kernel void boxFilter(__global const uchar * srcptr, int src_step, int srcOffs
     sumOfCols[local_id] = tmp_sum;
     barrier(CLK_LOCAL_MEM_FENCE);
 
-    int dst_index = mad24(y, dst_step, x * (int)sizeof(DT) + dst_offset);
+    int dst_index = mad24(y, dst_step, mad24(x, DSTSIZE, dst_offset));
     __global DT * dst = (__global DT *)(dstptr + dst_index);
 
     int sy_index = 0; // current index in data[] array
@@ -196,10 +208,11 @@ __kernel void boxFilter(__global const uchar * srcptr, int src_step, int srcOffs
                 total_sum += sumOfCols[local_id + sx - ANCHOR_X];
 
 #ifdef NORMALIZE
-            dst[0] = convertToDT((WT)(alpha) * total_sum);
+            DT dstval = convertToDT((WT)(alpha) * total_sum);
 #else
-            dst[0] = convertToDT(total_sum);
+            DT dstval = convertToDT(total_sum);
 #endif
+            storepix(dstval, dst);
         }
         barrier(CLK_LOCAL_MEM_FENCE);
 
diff --git a/modules/imgproc/src/opencl/canny.cl b/modules/imgproc/src/opencl/canny.cl
index 88b406f40..99cfc3b63 100644
--- a/modules/imgproc/src/opencl/canny.cl
+++ b/modules/imgproc/src/opencl/canny.cl
@@ -43,6 +43,18 @@
 //
 //M*/
 
+#ifdef OP_SOBEL
+
+#if cn != 3 // load as ucharT -> intT, store as shortT
+#define loadpix(addr) convertToIntT(*(__global const ucharT *)(addr))
+#define storepix(val, addr) *(__global shortT *)(addr) = convertToShortT(val)
+#define shortSize ((int)sizeof(shortT))
+#else // cn == 3: use vload3/vstore3 on scalar uchar/short; pixel size is sizeof(short) * cn
+#define loadpix(addr) convertToIntT(vload3(0, (__global const uchar *)(addr)))
+#define storepix(val, addr) vstore3(convertToShortT(val), 0, (__global short *)(addr))
+#define shortSize ((int)sizeof(short) * cn)
+#endif
+
 // Smoothing perpendicular to the derivative direction with a triangle filter
 // only support 3x3 Sobel kernel
 // h (-1) =  1, h (0) =  2, h (1) =  1
@@ -54,11 +66,9 @@
 // dx_buf	output dx buffer
 // dy_buf	output dy buffer
 
-__kernel void __attribute__((reqd_work_group_size(16, 16, 1)))
-calcSobelRowPass
-    (__global const uchar * src, int src_step, int src_offset, int rows, int cols,
-     __global uchar * dx_buf, int dx_buf_step, int dx_buf_offset,
-     __global uchar * dy_buf, int dy_buf_step, int dy_buf_offset)
+__kernel void calcSobelRowPass(__global const uchar * src, int src_step, int src_offset, int rows, int cols,
+                               __global uchar * dx_buf, int dx_buf_step, int dx_buf_offset,
+                               __global uchar * dy_buf, int dy_buf_step, int dy_buf_offset)
 {
     int gidx = get_global_id(0);
     int gidy = get_global_id(1);
@@ -66,34 +76,39 @@ calcSobelRowPass
     int lidx = get_local_id(0);
     int lidy = get_local_id(1);
 
-    __local int smem[16][18];
+    __local intT smem[16][18];
 
-    smem[lidy][lidx + 1] = src[mad24(src_step, min(gidy, rows - 1), gidx + src_offset)];
+    smem[lidy][lidx + 1] = loadpix(src + mad24(src_step, min(gidy, rows - 1), mad24(gidx, cn, src_offset)));
     if (lidx == 0)
     {
-        smem[lidy][0]  = src[mad24(src_step, min(gidy, rows - 1), max(gidx - 1,  0)        + src_offset)];
-        smem[lidy][17] = src[mad24(src_step, min(gidy, rows - 1), min(gidx + 16, cols - 1) + src_offset)];
+        smem[lidy][0]  = loadpix(src + mad24(src_step, min(gidy, rows - 1), mad24(max(gidx - 1,  0), cn, src_offset)));
+        smem[lidy][17] = loadpix(src + mad24(src_step, min(gidy, rows - 1), mad24(min(gidx + 16, cols - 1), cn, src_offset)));
     }
     barrier(CLK_LOCAL_MEM_FENCE);
 
     if (gidy < rows && gidx < cols)
     {
-        *(__global short *)(dx_buf + mad24(gidy, dx_buf_step, gidx * (int)sizeof(short) + dx_buf_offset)) =
-            smem[lidy][lidx + 2] - smem[lidy][lidx];
-        *(__global short *)(dy_buf + mad24(gidy, dy_buf_step, gidx * (int)sizeof(short) + dy_buf_offset)) =
-            smem[lidy][lidx] + 2 * smem[lidy][lidx + 1] + smem[lidy][lidx + 2];
+        storepix(smem[lidy][lidx + 2] - smem[lidy][lidx],
+                 dx_buf + mad24(gidy, dx_buf_step, mad24(gidx, shortSize, dx_buf_offset)));
+        storepix(mad24(2, smem[lidy][lidx + 1], smem[lidy][lidx] + smem[lidy][lidx + 2]),
+                 dy_buf + mad24(gidy, dy_buf_step, mad24(gidx, shortSize, dy_buf_offset)));
     }
 }
 
-inline int calc(short x, short y)
+#elif defined OP_MAG_BUF || defined OP_MAG
+
+inline intT calc(shortT x, shortT y)
 {
 #ifdef L2GRAD
-    return x * x + y * y;
+    intT intx = convertToIntT(x), inty = convertToIntT(y);
+    return intx * intx + inty * inty;
 #else
-    return (x >= 0 ? x : -x) + (y >= 0 ? y : -y);
+    return convertToIntT( (x >= (shortT)(0) ? x : -x) + (y >= (shortT)(0) ? y : -y) );
 #endif
 }
 
+#ifdef OP_MAG
+
 // calculate the magnitude of the filter pass combining both x and y directions
 // This is the non-buffered version(non-3x3 sobel)
 //
@@ -112,18 +127,43 @@ __kernel void calcMagnitude(__global const uchar * dxptr, int dx_step, int dx_of
 
     if (y < rows && x < cols)
     {
-        int dx_index = mad24(dx_step, y, x * (int)sizeof(short) + dx_offset);
-        int dy_index = mad24(dy_step, y, x * (int)sizeof(short) + dy_offset);
-        int mag_index = mad24(mag_step, y + 1, (x + 1) * (int)sizeof(int) + mag_offset);
+        int dx_index = mad24(dx_step, y, mad24(x, (int)sizeof(short) * cn, dx_offset));
+        int dy_index = mad24(dy_step, y, mad24(x, (int)sizeof(short) * cn, dy_offset));
+        int mag_index = mad24(mag_step, y + 1, mad24(x + 1, (int)sizeof(int), mag_offset));
 
-        __global const short * dx = (__global const short *)(dxptr + dx_index);
-        __global const short * dy = (__global const short *)(dyptr + dy_index);
+        __global short * dx = (__global short *)(dxptr + dx_index);
+        __global short * dy = (__global short *)(dyptr + dy_index);
         __global int * mag = (__global int *)(magptr + mag_index);
 
-        mag[0] = calc(dx[0], dy[0]);
+        int cmag = calc(dx[0], dy[0]);
+#if cn > 1
+        short cx = dx[0], cy = dy[0];
+        int pmag;
+
+        #pragma unroll
+        for (int i = 1; i < cn; ++i)
+        {
+            pmag = calc(dx[i], dy[i]);
+            if (pmag > cmag)
+                cmag = pmag, cx = dx[i], cy = dy[i];
+        }
+
+        dx[0] = cx, dy[0] = cy;
+#endif
+        mag[0] = cmag;
     }
 }
 
+#elif defined OP_MAG_BUF
+
+#if cn != 3 // pixels are read as whole shortT values
+#define loadpix(addr) *(__global const shortT *)(addr)
+#define shortSize ((int)sizeof(shortT))
+#else // cn == 3: use vload3 on scalar short; pixel size is sizeof(short) * cn
+#define loadpix(addr) vload3(0, (__global const short *)(addr))
+#define shortSize ((int)sizeof(short)*cn)
+#endif
+
 // calculate the magnitude of the filter pass combining both x and y directions
 // This is the buffered version(3x3 sobel)
 //
@@ -132,59 +172,64 @@ __kernel void calcMagnitude(__global const uchar * dxptr, int dx_step, int dx_of
 // dx			direvitive in x direction output
 // dy			direvitive in y direction output
 // mag			magnitude direvitive of xy output
-__kernel void __attribute__((reqd_work_group_size(16, 16, 1)))
-calcMagnitude_buf
-    (__global const short * dx_buf, int dx_buf_step, int dx_buf_offset,
-     __global const short * dy_buf, int dy_buf_step, int dy_buf_offset,
-     __global short * dx, int dx_step, int dx_offset,
-     __global short * dy, int dy_step, int dy_offset,
-     __global int * mag, int mag_step, int mag_offset,
-     int rows, int cols)
+__kernel void calcMagnitude_buf(__global const uchar * dx_buf, int dx_buf_step, int dx_buf_offset,
+                                __global const uchar * dy_buf, int dy_buf_step, int dy_buf_offset,
+                                __global uchar * dx, int dx_step, int dx_offset,
+                                __global uchar * dy, int dy_step, int dy_offset,
+                                __global uchar * mag, int mag_step, int mag_offset, int rows, int cols)
 {
-    dx_buf_step    /= sizeof(*dx_buf);
-    dx_buf_offset  /= sizeof(*dx_buf);
-    dy_buf_step    /= sizeof(*dy_buf);
-    dy_buf_offset  /= sizeof(*dy_buf);
-    dx_step    /= sizeof(*dx);
-    dx_offset  /= sizeof(*dx);
-    dy_step    /= sizeof(*dy);
-    dy_offset  /= sizeof(*dy);
-    mag_step   /= sizeof(*mag);
-    mag_offset /= sizeof(*mag);
-
     int gidx = get_global_id(0);
     int gidy = get_global_id(1);
 
     int lidx = get_local_id(0);
     int lidy = get_local_id(1);
 
-    __local short sdx[18][16];
-    __local short sdy[18][16];
+    __local shortT sdx[18][16];
+    __local shortT sdy[18][16];
 
-    sdx[lidy + 1][lidx] = dx_buf[gidx + min(gidy, rows - 1) * dx_buf_step + dx_buf_offset];
-    sdy[lidy + 1][lidx] = dy_buf[gidx + min(gidy, rows - 1) * dy_buf_step + dy_buf_offset];
+    sdx[lidy + 1][lidx] = loadpix(dx_buf + mad24(min(gidy, rows - 1), dx_buf_step, mad24(gidx, shortSize, dx_buf_offset)));
+    sdy[lidy + 1][lidx] = loadpix(dy_buf + mad24(min(gidy, rows - 1), dy_buf_step, mad24(gidx, shortSize, dy_buf_offset)));
     if (lidy == 0)
     {
-        sdx[0][lidx]  = dx_buf[gidx + min(max(gidy - 1, 0), rows - 1) * dx_buf_step + dx_buf_offset];
-        sdx[17][lidx] = dx_buf[gidx + min(gidy + 16, rows - 1)        * dx_buf_step + dx_buf_offset];
+        sdx[0][lidx]  = loadpix(dx_buf + mad24(clamp(gidy - 1, 0, rows - 1), dx_buf_step, mad24(gidx, shortSize, dx_buf_offset)));
+        sdx[17][lidx] = loadpix(dx_buf + mad24(min(gidy + 16, rows - 1), dx_buf_step, mad24(gidx, shortSize, dx_buf_offset)));
 
-        sdy[0][lidx]  = dy_buf[gidx + min(max(gidy - 1, 0), rows - 1) * dy_buf_step + dy_buf_offset];
-        sdy[17][lidx] = dy_buf[gidx + min(gidy + 16, rows - 1)        * dy_buf_step + dy_buf_offset];
+        sdy[0][lidx]  = loadpix(dy_buf + mad24(clamp(gidy - 1, 0, rows - 1), dy_buf_step, mad24(gidx, shortSize, dy_buf_offset)));
+        sdy[17][lidx] = loadpix(dy_buf + mad24(min(gidy + 16, rows - 1), dy_buf_step, mad24(gidx, shortSize, dy_buf_offset)));
     }
     barrier(CLK_LOCAL_MEM_FENCE);
 
     if (gidx < cols && gidy < rows)
     {
-        short x =  sdx[lidy][lidx] + 2 * sdx[lidy + 1][lidx] + sdx[lidy + 2][lidx];
-        short y = -sdy[lidy][lidx] + sdy[lidy + 2][lidx];
+        shortT x = sdx[lidy + 1][lidx] * (shortT)(2) + sdx[lidy][lidx] + sdx[lidy + 2][lidx];
+        shortT y = -sdy[lidy][lidx] + sdy[lidy + 2][lidx];
 
-        dx[gidx + gidy * dx_step + dx_offset] = x;
-        dy[gidx + gidy * dy_step + dy_offset] = y;
+#if cn == 1
+        *(__global short *)(dx + mad24(gidy, dx_step, mad24(gidx, shortSize, dx_offset))) = x;
+        *(__global short *)(dy + mad24(gidy, dy_step, mad24(gidx, shortSize, dy_offset))) = y;
 
-        mag[(gidx + 1) + (gidy + 1) * mag_step + mag_offset] = calc(x, y);
+        *(__global int *)(mag + mad24(gidy + 1, mag_step, mad24(gidx + 1, (int)sizeof(int), mag_offset))) = calc(x, y);
+#elif cn == 3
+        intT magv = calc(x, y);
+        short cx = x.x, cy = y.x;
+        int cmag = magv.x;
+
+        if (cmag < magv.y)
+            cx = x.y, cy = y.y, cmag = magv.y;
+        if (cmag < magv.z)
+            cx = x.z, cy = y.z, cmag = magv.z;
+
+        *(__global short *)(dx + mad24(gidy, dx_step, mad24(gidx, shortSize, dx_offset))) = cx;
+        *(__global short *)(dy + mad24(gidy, dy_step, mad24(gidx, shortSize, dy_offset))) = cy;
+
+        *(__global int *)(mag + mad24(gidy + 1, mag_step, mad24(gidx + 1, (int)sizeof(int), mag_offset))) = cmag;
+#endif
     }
 }
 
+#endif
+
+#elif defined OP_MAP
 
 //////////////////////////////////////////////////////////////////////////////////////////
 // 0.4142135623730950488016887242097 is tan(22.5)
@@ -208,13 +253,11 @@ calcMagnitude_buf
 // mag			magnitudes calculated from calcMagnitude function
 // map			output containing raw edge types
 
-__kernel void __attribute__((reqd_work_group_size(16,16,1)))
-calcMap(
-    __global const uchar * dx, int dx_step, int dx_offset,
-    __global const uchar * dy, int dy_step, int dy_offset,
-    __global const uchar * mag, int mag_step, int mag_offset,
-    __global uchar * map, int map_step, int map_offset,
-    int rows, int cols, int low_thresh, int high_thresh)
+__kernel void calcMap(__global const uchar * dx, int dx_step, int dx_offset,
+                      __global const uchar * dy, int dy_step, int dy_offset,
+                      __global const uchar * mag, int mag_step, int mag_offset,
+                      __global uchar * map, int map_step, int map_offset,
+                      int rows, int cols, int low_thresh, int high_thresh)
 {
     __local int smem[18][18];
 
@@ -227,7 +270,7 @@ calcMap(
     int grp_idx = get_global_id(0) & 0xFFFFF0;
     int grp_idy = get_global_id(1) & 0xFFFFF0;
 
-    int tid = lidx + lidy * 16;
+    int tid = mad24(lidy, 16, lidx);
     int lx = tid % 18;
     int ly = tid / 18;
 
@@ -250,8 +293,8 @@ calcMap(
 
         if (m > low_thresh)
         {
-            short xs = *(__global const short *)(dx + mad24(gidy, dx_step, dx_offset + (int)sizeof(short) * gidx));
-            short ys = *(__global const short *)(dy + mad24(gidy, dy_step, dy_offset + (int)sizeof(short) * gidx));
+            short xs = *(__global const short *)(dx + mad24(gidy, dx_step, mad24(gidx, (int)sizeof(short) * cn, dx_offset)));
+            short ys = *(__global const short *)(dy + mad24(gidy, dy_step, mad24(gidx, (int)sizeof(short) * cn, dy_offset)));
             int x = abs(xs), y = abs(ys);
 
             int tg22x = x * TG22;
@@ -278,13 +321,15 @@ calcMap(
                 }
             }
         }
-        *(__global int *)(map + mad24(map_step, gidy + 1, (gidx + 1) * (int)sizeof(int) + map_offset)) = edge_type;
+        *(__global int *)(map + mad24(map_step, gidy + 1, mad24(gidx + 1, (int)sizeof(int), map_offset))) = edge_type; // stored at (gidy + 1, gidx + 1)
     }
 }
 
 #undef CANNY_SHIFT
 #undef TG22
 
+#elif defined OP_HYST_LOCAL
+
 struct PtrStepSz
 {
     __global uchar * ptr;
@@ -312,11 +357,9 @@ inline void set(struct PtrStepSz data, int y, int x, int value)
 // stack	the potiential edge points found in this kernel call
 // counter	the number of potiential edge points
 
-__kernel void __attribute__((reqd_work_group_size(16,16,1)))
-edgesHysteresisLocal
-    (__global uchar * map_ptr, int map_step, int map_offset,
-     __global ushort2 * st, __global unsigned int * counter,
-    int rows, int cols)
+__kernel void edgesHysteresisLocal(__global uchar * map_ptr, int map_step, int map_offset,
+                                   __global ushort2 * st, __global unsigned int * counter,
+                                   int rows, int cols)
 {
     struct PtrStepSz map = { map_ptr + map_offset, map_step, rows + 1, cols + 1 };
 
@@ -402,6 +445,8 @@ edgesHysteresisLocal
     }
 }
 
+#elif defined OP_HYST_GLOBAL
+
 __constant int c_dx[8] = {-1,  0,  1, -1, 1, -1, 0, 1};
 __constant int c_dy[8] = {-1, -1, -1,  0, 0,  1, 1, 1};
 
@@ -409,10 +454,9 @@ __constant int c_dy[8] = {-1, -1, -1,  0, 0,  1, 1, 1};
 #define stack_size 512
 #define map_index mad24(map_step, pos.y, pos.x * (int)sizeof(int))
 
-__kernel void __attribute__((reqd_work_group_size(128, 1, 1)))
-edgesHysteresisGlobal(__global uchar * map, int map_step, int map_offset,
-    __global ushort2 * st1, __global ushort2 * st2, __global int * counter,
-    int rows, int cols, int count)
+__kernel void edgesHysteresisGlobal(__global uchar * map, int map_step, int map_offset,
+                                    __global ushort2 * st1, __global ushort2 * st2, __global int * counter,
+                                    int rows, int cols, int count)
 {
     map += map_offset;
 
@@ -492,6 +536,8 @@ edgesHysteresisGlobal(__global uchar * map, int map_step, int map_offset,
 #undef map_index
 #undef stack_size
 
+#elif defined OP_EDGES
+
 // Get the edge result. egde type of value 2 will be marked as an edge point and set to 255. Otherwise 0.
 // map		edge type mappings
 // dst		edge output
@@ -504,7 +550,7 @@ __kernel void getEdges(__global const uchar * mapptr, int map_step, int map_offs
 
     if (y < rows && x < cols)
     {
-        int map_index = mad24(map_step, y + 1, (x + 1) * (int)sizeof(int) + map_offset);
+        int map_index = mad24(map_step, y + 1, mad24(x + 1, (int)sizeof(int), map_offset));
         int dst_index = mad24(dst_step, y, x + dst_offset);
 
         __global const int * map = (__global const int *)(mapptr + map_index);
@@ -512,3 +558,5 @@ __kernel void getEdges(__global const uchar * mapptr, int map_step, int map_offs
         dst[dst_index] = (uchar)(-(map[0] >> 1));
     }
 }
+
+#endif
diff --git a/modules/imgproc/src/opencl/filter2D.cl b/modules/imgproc/src/opencl/filter2D.cl
index d36071497..49657181f 100644
--- a/modules/imgproc/src/opencl/filter2D.cl
+++ b/modules/imgproc/src/opencl/filter2D.cl
@@ -122,7 +122,7 @@
     }
 #ifdef BORDER_REFLECT
 #define EXTRAPOLATE(x, y, minX, minY, maxX, maxY) EXTRAPOLATE_(x, y, minX, minY, maxX, maxY, 0)
-#elif defined(BORDER_REFLECT_101)
+#elif defined(BORDER_REFLECT_101) || defined(BORDER_REFLECT101)
 #define EXTRAPOLATE(x, y, minX, minY, maxX, maxY) EXTRAPOLATE_(x, y, minX, minY, maxX, maxY, 1)
 #endif
 #else
@@ -142,109 +142,49 @@
     }
 #endif
 
-#if USE_DOUBLE
+#ifdef DOUBLE_SUPPORT
 #ifdef cl_amd_fp64
 #pragma OPENCL EXTENSION cl_amd_fp64:enable
 #elif defined (cl_khr_fp64)
 #pragma OPENCL EXTENSION cl_khr_fp64:enable
 #endif
-#define FPTYPE double
-#define CONVERT_TO_FPTYPE CAT(convert_double, VEC_SIZE)
-#else
-#define FPTYPE float
-#define CONVERT_TO_FPTYPE CAT(convert_float, VEC_SIZE)
 #endif
 
-#if DATA_DEPTH == 0
-#define BASE_TYPE uchar
-#elif DATA_DEPTH == 1
-#define BASE_TYPE char
-#elif DATA_DEPTH == 2
-#define BASE_TYPE ushort
-#elif DATA_DEPTH == 3
-#define BASE_TYPE short
-#elif DATA_DEPTH == 4
-#define BASE_TYPE int
-#elif DATA_DEPTH == 5
-#define BASE_TYPE float
-#elif DATA_DEPTH == 6
-#define BASE_TYPE double
+#if cn != 3 // any channel count but 3: pixels are read and written directly as srcT / dstT
+#define loadpix(addr) (*(__global const srcT *)(addr))
+#define storepix(val, addr)  *(__global dstT *)(addr) = val
+#define SRCSIZE ((int)sizeof(srcT)) // bytes per source pixel
+#define DSTSIZE ((int)sizeof(dstT)) // bytes per destination pixel
 #else
-#error data_depth
+#define loadpix(addr) vload3(0, (__global const srcT1 *)(addr))
+#define storepix(val, addr) vstore3(val, 0, (__global dstT1 *)(addr))
+#define SRCSIZE ((int)sizeof(srcT1) * cn) // parenthesized so the product stays intact next to any operator
+#define DSTSIZE ((int)sizeof(dstT1) * cn) // element size times channel count
 #endif
 
-#define __CAT(x, y) x##y
-#define CAT(x, y) __CAT(x, y)
-
-#define uchar1 uchar
-#define char1 char
-#define ushort1 ushort
-#define short1 short
-#define int1 int
-#define float1 float
-#define double1 double
-
-#define convert_uchar1_sat_rte convert_uchar_sat_rte
-#define convert_char1_sat_rte convert_char_sat_rte
-#define convert_ushort1_sat_rte convert_ushort_sat_rte
-#define convert_short1_sat_rte convert_short_sat_rte
-#define convert_int1_sat_rte convert_int_sat_rte
-#define convert_float1
-#define convert_double1
-
-#if DATA_DEPTH == 5 || DATA_DEPTH == 6
-#define CONVERT_TO_TYPE CAT(CAT(convert_, BASE_TYPE), VEC_SIZE)
-#else
-#define CONVERT_TO_TYPE CAT(CAT(CAT(convert_, BASE_TYPE), VEC_SIZE), _sat_rte)
-#endif
-
-#define VEC_SIZE DATA_CHAN
-
-#define VEC_TYPE CAT(BASE_TYPE, VEC_SIZE)
-#define TYPE VEC_TYPE
-
-#define SCALAR_TYPE CAT(FPTYPE, VEC_SIZE)
-
-#define INTERMEDIATE_TYPE CAT(FPTYPE, VEC_SIZE)
+#define noconvert
 
 struct RectCoords
 {
     int x1, y1, x2, y2;
 };
 
-//#define DEBUG
-#ifdef DEBUG
-#define DEBUG_ONLY(x) x
-#define ASSERT(condition) do { if (!(condition)) { printf("BUG in boxFilter kernel (global=%d,%d): " #condition "\n", get_global_id(0), get_global_id(1)); } } while (0)
-#else
-#define DEBUG_ONLY(x) (void)0
-#define ASSERT(condition) (void)0
-#endif
-
-
-inline INTERMEDIATE_TYPE readSrcPixel(int2 pos, __global const uchar* srcptr, int srcstep, const struct RectCoords srcCoords
-#ifdef BORDER_CONSTANT
-               , SCALAR_TYPE borderValue
-#endif
-    )
+inline WT readSrcPixel(int2 pos, __global const uchar * srcptr, int src_step, const struct RectCoords srcCoords)
 {
 #ifdef BORDER_ISOLATED
-    if(pos.x >= srcCoords.x1 && pos.y >= srcCoords.y1 && pos.x < srcCoords.x2 && pos.y < srcCoords.y2)
+    if (pos.x >= srcCoords.x1 && pos.y >= srcCoords.y1 && pos.x < srcCoords.x2 && pos.y < srcCoords.y2)
 #else
-    if(pos.x >= 0 && pos.y >= 0 && pos.x < srcCoords.x2 && pos.y < srcCoords.y2)
+    if (pos.x >= 0 && pos.y >= 0 && pos.x < srcCoords.x2 && pos.y < srcCoords.y2)
 #endif
     {
-        //__global TYPE* ptr = (__global TYPE*)((__global char*)src + pos.x * sizeof(TYPE) + pos.y * srcStepBytes);
-        __global TYPE* ptr = (__global TYPE*)(srcptr + pos.y * srcstep + pos.x * sizeof(TYPE));
-        return CONVERT_TO_FPTYPE(*ptr);
+        return convertToWT(loadpix(srcptr + mad24(pos.y, src_step, pos.x * SRCSIZE)));
     }
     else
     {
 #ifdef BORDER_CONSTANT
-        return borderValue;
+        return (WT)(0);
 #else
-        int selected_col = pos.x;
-        int selected_row = pos.y;
+        int selected_col = pos.x, selected_row = pos.y;
 
         EXTRAPOLATE(selected_col, selected_row,
 #ifdef BORDER_ISOLATED
@@ -255,68 +195,43 @@ inline INTERMEDIATE_TYPE readSrcPixel(int2 pos, __global const uchar* srcptr, in
                 srcCoords.x2, srcCoords.y2
          );
 
-        // debug border mapping
-        //printf("pos=%d,%d --> %d, %d\n", pos.x, pos.y, selected_col, selected_row);
-
-        pos = (int2)(selected_col, selected_row);
-        if(pos.x >= 0 && pos.y >= 0 && pos.x < srcCoords.x2 && pos.y < srcCoords.y2)
-        {
-            //__global TYPE* ptr = (__global TYPE*)((__global char*)src + pos.x * sizeof(TYPE) + pos.y * srcStepBytes);
-            __global TYPE* ptr = (__global TYPE*)(srcptr + pos.y * srcstep + pos.x * sizeof(TYPE));
-            return CONVERT_TO_FPTYPE(*ptr);
-        }
-        else
-        {
-            // for debug only
-            DEBUG_ONLY(printf("BUG in boxFilter kernel\n"));
-            return (FPTYPE)(0.0f);
-        }
+        return convertToWT(loadpix(srcptr + mad24(selected_row, src_step, selected_col * SRCSIZE)));
 #endif
     }
 }
 
-// INPUT PARAMETER: BLOCK_SIZE_Y (via defines)
+#define DIG(a) a,
+__constant WT1 kernelData[] = { COEFF };
 
-__kernel
-__attribute__((reqd_work_group_size(LOCAL_SIZE, 1, 1)))
-void filter2D(__global const uchar* srcptr, int srcstep, int srcOffsetX, int srcOffsetY, int srcEndX, int srcEndY,
-                __global uchar* dstptr, int dststep, int dstoffset,
-               int rows, int cols,
-#ifdef BORDER_CONSTANT
-               SCALAR_TYPE borderValue,
-#endif
-               __constant FPTYPE* kernelData // transposed: [KERNEL_SIZE_X][KERNEL_SIZE_Y2_ALIGNED]
-               )
+__kernel void filter2D(__global const uchar * srcptr, int src_step, int srcOffsetX, int srcOffsetY, int srcEndX, int srcEndY,
+                       __global uchar * dstptr, int dst_step, int dst_offset, int rows, int cols, float delta)
 {
-    const struct RectCoords srcCoords = {srcOffsetX, srcOffsetY, srcEndX, srcEndY}; // for non-isolated border: offsetX, offsetY, wholeX, wholeY
+    const struct RectCoords srcCoords = { srcOffsetX, srcOffsetY, srcEndX, srcEndY }; // for non-isolated border: offsetX, offsetY, wholeX, wholeY
 
-    const int local_id = get_local_id(0);
-    const int x = local_id + (LOCAL_SIZE - (KERNEL_SIZE_X - 1)) * get_group_id(0) - ANCHOR_X;
-    const int y = get_global_id(1) * BLOCK_SIZE_Y;
+    int local_id = get_local_id(0);
+    int x = local_id + (LOCAL_SIZE - (KERNEL_SIZE_X - 1)) * get_group_id(0) - ANCHOR_X;
+    int y = get_global_id(1) * BLOCK_SIZE_Y;
 
-    INTERMEDIATE_TYPE data[KERNEL_SIZE_Y];
-    __local INTERMEDIATE_TYPE sumOfCols[LOCAL_SIZE];
+    WT data[KERNEL_SIZE_Y];
+    __local WT sumOfCols[LOCAL_SIZE];
 
     int2 srcPos = (int2)(srcCoords.x1 + x, srcCoords.y1 + y - ANCHOR_Y);
 
     int2 pos = (int2)(x, y);
-    __global TYPE* dstPtr = (__global TYPE*)((__global char*)dstptr + pos.y * dststep + dstoffset + pos.x * sizeof(TYPE)); // Pointer can be out of bounds!
-    bool writeResult = ((local_id >= ANCHOR_X) && (local_id < LOCAL_SIZE - (KERNEL_SIZE_X - 1 - ANCHOR_X)) &&
-                        (pos.x >= 0) && (pos.x < cols));
+    __global dstT * dst = (__global dstT *)(dstptr + mad24(pos.y, dst_step, mad24(pos.x, DSTSIZE, dst_offset))); // Pointer can be out of bounds!
+    bool writeResult = local_id >= ANCHOR_X && local_id < LOCAL_SIZE - (KERNEL_SIZE_X - 1 - ANCHOR_X) &&
+                        pos.x >= 0 && pos.x < cols;
 
 #if BLOCK_SIZE_Y > 1
     bool readAllpixels = true;
     int sy_index = 0; // current index in data[] array
 
     dstRowsMax = min(rows, pos.y + BLOCK_SIZE_Y);
-    for (;
-         pos.y < dstRowsMax;
-         pos.y++,
-         dstPtr = (__global TYPE*)((__global char*)dstptr + dststep))
+    for ( ;
+          pos.y < dstRowsMax;
+          pos.y++, dst = (__global dstT *)((__global uchar *)dst + dst_step))
 #endif
     {
-        ASSERT(pos.y < dstRowsMax);
-
         for (
 #if BLOCK_SIZE_Y > 1
             int sy = readAllpixels ? 0 : -1; sy < (readAllpixels ? KERNEL_SIZE_Y : 0);
@@ -325,27 +240,21 @@ void filter2D(__global const uchar* srcptr, int srcstep, int srcOffsetX, int src
 #endif
             sy++, srcPos.y++)
         {
-            data[sy + sy_index] = readSrcPixel(srcPos, srcptr, srcstep, srcCoords
-#ifdef BORDER_CONSTANT
-                    , borderValue
-#endif
-                    );
+            data[sy + sy_index] = readSrcPixel(srcPos, srcptr, src_step, srcCoords);
         }
 
-        INTERMEDIATE_TYPE total_sum = 0;
+        WT total_sum = 0;
         for (int sx = 0; sx < KERNEL_SIZE_X; sx++)
         {
             {
-                __constant FPTYPE* k = &kernelData[KERNEL_SIZE_Y2_ALIGNED * sx
+                __constant WT1 * k = &kernelData[KERNEL_SIZE_Y2_ALIGNED * sx
 #if BLOCK_SIZE_Y > 1
                                                    + KERNEL_SIZE_Y - sy_index
 #endif
                                                    ];
-                INTERMEDIATE_TYPE tmp_sum = 0;
+                WT tmp_sum = 0;
                 for (int sy = 0; sy < KERNEL_SIZE_Y; sy++)
-                {
                     tmp_sum += data[sy] * k[sy];
-                }
 
                 sumOfCols[local_id] = tmp_sum;
                 barrier(CLK_LOCAL_MEM_FENCE);
@@ -359,14 +268,12 @@ void filter2D(__global const uchar* srcptr, int srcstep, int srcOffsetX, int src
         }
 
         if (writeResult)
-        {
-            *dstPtr = CONVERT_TO_TYPE(total_sum);
-        }
+            storepix(convertToDstT(total_sum + (WT)(delta)), dst);
 
 #if BLOCK_SIZE_Y > 1
         readAllpixels = false;
 #if BLOCK_SIZE_Y > KERNEL_SIZE_Y
-        sy_index = (sy_index + 1 <= KERNEL_SIZE_Y) ? sy_index + 1 : 1;
+        sy_index = sy_index + 1 <= KERNEL_SIZE_Y ? sy_index + 1 : 1;
 #else
         sy_index++;
 #endif
diff --git a/modules/imgproc/src/opencl/filterSepCol.cl b/modules/imgproc/src/opencl/filterSepCol.cl
index 30a2221cf..29514cc21 100644
--- a/modules/imgproc/src/opencl/filterSepCol.cl
+++ b/modules/imgproc/src/opencl/filterSepCol.cl
@@ -34,47 +34,36 @@
 //
 //
 
+#ifdef DOUBLE_SUPPORT
+#ifdef cl_amd_fp64
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#elif defined (cl_khr_fp64)
+#pragma OPENCL EXTENSION cl_khr_fp64:enable
+#endif
+#endif
+
 #define READ_TIMES_COL ((2*(RADIUSY+LSIZE1)-1)/LSIZE1)
 #define RADIUS 1
-#if CN ==1
-#define ALIGN (((RADIUS)+3)>>2<<2)
-#elif CN==2
-#define ALIGN (((RADIUS)+1)>>1<<1)
-#elif CN==3
-#define ALIGN (((RADIUS)+3)>>2<<2)
-#elif CN==4
-#define ALIGN (RADIUS)
-#define READ_TIMES_ROW ((2*(RADIUS+LSIZE0)-1)/LSIZE0)
-#endif
 
 #define noconvert
 
-/**********************************************************************************
-These kernels are written for separable filters such as Sobel, Scharr, GaussianBlur.
-Now(6/29/2011) the kernels only support 8U data type and the anchor of the convovle
-kernel must be in the center. ROI is not supported either.
-Each kernels read 4 elements(not 4 pixels), save them to LDS and read the data needed
-from LDS to calculate the result.
-The length of the convovle kernel supported is only related to the MAX size of LDS,
-which is HW related.
-Niko
-6/29/2011
-The info above maybe obsolete.
-***********************************************************************************/
+#if CN != 3 // any channel count but 3: pixels are read and written directly as srcT / dstT
+#define loadpix(addr) (*(__global const srcT *)(addr))
+#define storepix(val, addr)  *(__global dstT *)(addr) = val
+#define SRCSIZE ((int)sizeof(srcT)) // bytes per source pixel
+#define DSTSIZE ((int)sizeof(dstT)) // bytes per destination pixel
+#else // CN == 3: go through vload3 / vstore3 on the scalar element type
+#define loadpix(addr)  vload3(0, (__global const srcT1 *)(addr))
+#define storepix(val, addr) vstore3(val, 0, (__global dstT1 *)(addr))
+#define SRCSIZE ((int)sizeof(srcT1) * 3) // parenthesized so the product stays intact next to any operator
+#define DSTSIZE ((int)sizeof(dstT1) * 3) // three elements per pixel
+#endif
 
 #define DIG(a) a,
 __constant float mat_kernel[] = { COEFF };
 
-__kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void col_filter
-                        (__global const GENTYPE_SRC * restrict src,
-                         const int src_step_in_pixel,
-                         const int src_whole_cols,
-                         const int src_whole_rows,
-                         __global GENTYPE_DST * dst,
-                         const int dst_offset_in_pixel,
-                         const int dst_step_in_pixel,
-                         const int dst_cols,
-                         const int dst_rows)
+__kernel void col_filter(__global const uchar * src, int src_step, int src_offset, int src_whole_rows, int src_whole_cols,
+                         __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols, float delta)
 {
     int x = get_global_id(0);
     int y = get_global_id(1);
@@ -82,38 +71,38 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void col_filter
     int l_x = get_local_id(0);
     int l_y = get_local_id(1);
 
-    int start_addr = mad24(y, src_step_in_pixel, x);
-    int end_addr = mad24(src_whole_rows - 1, src_step_in_pixel, src_whole_cols);
+    int start_addr = mad24(y, src_step, x * SRCSIZE);
+    int end_addr = mad24(src_whole_rows - 1, src_step, src_whole_cols * SRCSIZE);
 
-    int i;
-    GENTYPE_SRC sum, temp[READ_TIMES_COL];
-    __local GENTYPE_SRC LDS_DAT[LSIZE1 * READ_TIMES_COL][LSIZE0 + 1];
+    srcT sum, temp[READ_TIMES_COL];
+    __local srcT LDS_DAT[LSIZE1 * READ_TIMES_COL][LSIZE0 + 1];
 
-    //read pixels from src
-    for(i = 0;i<READ_TIMES_COL;i++)
+    // read pixels from src
+    for (int i = 0; i < READ_TIMES_COL; ++i)
     {
-        int current_addr = start_addr+i*LSIZE1*src_step_in_pixel;
+        int current_addr = mad24(i, LSIZE1 * src_step, start_addr);
         current_addr = current_addr < end_addr ? current_addr : 0;
-        temp[i] = src[current_addr];
-    }
-    //save pixels to lds
-    for(i = 0;i<READ_TIMES_COL;i++)
-    {
-        LDS_DAT[l_y+i*LSIZE1][l_x] = temp[i];
+        temp[i] = loadpix(src + current_addr);
     }
+
+    // save pixels to lds
+    for (int i = 0; i < READ_TIMES_COL; ++i)
+        LDS_DAT[mad24(i, LSIZE1, l_y)][l_x] = temp[i];
     barrier(CLK_LOCAL_MEM_FENCE);
-    //read pixels from lds and calculate the result
-    sum = LDS_DAT[l_y+RADIUSY][l_x]*mat_kernel[RADIUSY];
-    for(i=1;i<=RADIUSY;i++)
+
+    // read pixels from lds and calculate the result
+    sum = LDS_DAT[l_y + RADIUSY][l_x] * mat_kernel[RADIUSY];
+    for (int i = 1; i <= RADIUSY; ++i)
     {
-        temp[0]=LDS_DAT[l_y+RADIUSY-i][l_x];
-        temp[1]=LDS_DAT[l_y+RADIUSY+i][l_x];
-        sum += temp[0] * mat_kernel[RADIUSY-i]+temp[1] * mat_kernel[RADIUSY+i];
+        temp[0] = LDS_DAT[l_y + RADIUSY - i][l_x];
+        temp[1] = LDS_DAT[l_y + RADIUSY + i][l_x];
+        sum += mad(temp[0], mat_kernel[RADIUSY - i], temp[1] * mat_kernel[RADIUSY + i]);
     }
-    //write the result to dst
-    if((x<dst_cols) & (y<dst_rows))
+
+    // write the result to dst
+    if (x < dst_cols && y < dst_rows)
     {
-        start_addr = mad24(y, dst_step_in_pixel, x + dst_offset_in_pixel);
-        dst[start_addr] = convert_to_DST(sum);
+        start_addr = mad24(y, dst_step, mad24(DSTSIZE, x, dst_offset));
+        storepix(convertToDstT(sum + (srcT)(delta)), dst + start_addr);
     }
 }
diff --git a/modules/imgproc/src/opencl/filterSepRow.cl b/modules/imgproc/src/opencl/filterSepRow.cl
index d0623f590..726de448e 100644
--- a/modules/imgproc/src/opencl/filterSepRow.cl
+++ b/modules/imgproc/src/opencl/filterSepRow.cl
@@ -34,41 +34,37 @@
 //
 //
 
-#define READ_TIMES_ROW ((2*(RADIUSX+LSIZE0)-1)/LSIZE0) //for c4 only
-#define READ_TIMES_COL ((2*(RADIUSY+LSIZE1)-1)/LSIZE1)
-//#pragma OPENCL EXTENSION cl_amd_printf : enable
-#define RADIUS 1
-#if CN ==1
-#define ALIGN (((RADIUS)+3)>>2<<2)
-#elif CN==2
-#define ALIGN (((RADIUS)+1)>>1<<1)
-#elif CN==3
-#define ALIGN (((RADIUS)+3)>>2<<2)
-#elif CN==4
-#define ALIGN (RADIUS)
+#ifdef DOUBLE_SUPPORT
+#ifdef cl_amd_fp64
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#elif defined (cl_khr_fp64)
+#pragma OPENCL EXTENSION cl_khr_fp64:enable
+#endif
 #endif
 
+#define READ_TIMES_ROW ((2*(RADIUSX+LSIZE0)-1)/LSIZE0) //for c4 only
+#define RADIUS 1
+
 #ifdef BORDER_REPLICATE
-//BORDER_REPLICATE:     aaaaaa|abcdefgh|hhhhhhh
+// BORDER_REPLICATE: aaaaaa|abcdefgh|hhhhhhh
 #define ADDR_L(i, l_edge, r_edge)  ((i) <  (l_edge) ? (l_edge)   : (i))
 #define ADDR_R(i, r_edge, addr)    ((i) >= (r_edge) ? (r_edge)-1 : (addr))
 #endif
 
 #ifdef BORDER_REFLECT
-//BORDER_REFLECT:       fedcba|abcdefgh|hgfedcb
+// BORDER_REFLECT: fedcba|abcdefgh|hgfedcb
 #define ADDR_L(i, l_edge, r_edge)  ((i) <  (l_edge) ? -(i)-1               : (i))
 #define ADDR_R(i, r_edge, addr)    ((i) >= (r_edge) ? -(i)-1+((r_edge)<<1) : (addr))
 #endif
 
 #ifdef BORDER_REFLECT_101
-//BORDER_REFLECT_101:   gfedcb|abcdefgh|gfedcba
+// BORDER_REFLECT_101: gfedcb|abcdefgh|gfedcba
 #define ADDR_L(i, l_edge, r_edge)  ((i) <  (l_edge) ? -(i)                 : (i))
 #define ADDR_R(i, r_edge, addr)    ((i) >= (r_edge) ? -(i)-2+((r_edge)<<1) : (addr))
 #endif
 
-//blur function does not support BORDER_WRAP
 #ifdef BORDER_WRAP
-//BORDER_WRAP:          cdefgh|abcdefgh|abcdefg
+// BORDER_WRAP: cdefgh|abcdefgh|abcdefg
 #define ADDR_L(i, l_edge, r_edge)  ((i) <  (l_edge) ? (i)+(r_edge) : (i))
 #define ADDR_R(i, r_edge, addr)    ((i) >= (r_edge) ? (i)-(r_edge) : (addr))
 #endif
@@ -127,65 +123,56 @@
     #endif //BORDER_CONSTANT
 #endif //EXTRA_EXTRAPOLATION
 
-/**********************************************************************************
-These kernels are written for separable filters such as Sobel, Scharr, GaussianBlur.
-Now(6/29/2011) the kernels only support 8U data type and the anchor of the convovle
-kernel must be in the center. ROI is not supported either.
-For channels =1,2,4, each kernels read 4 elements(not 4 pixels), and for channels =3,
-the kernel read 4 pixels, save them to LDS and read the data needed from LDS to
-calculate the result.
-The length of the convovle kernel supported is related to the LSIZE0 and the MAX size
-of LDS, which is HW related.
-For channels = 1,3 the RADIUS is no more than LSIZE0*2
-For channels = 2, the RADIUS is no more than LSIZE0
-For channels = 4, arbitary RADIUS is supported unless the LDS is not enough
-Niko
-6/29/2011
-The info above maybe obsolete.
-***********************************************************************************/
+#define noconvert
+
+#if CN != 3 // any channel count but 3: pixels are read and written directly as srcT / dstT
+#define loadpix(addr) (*(__global const srcT *)(addr))
+#define storepix(val, addr)  *(__global dstT *)(addr) = val
+#define SRCSIZE ((int)sizeof(srcT)) // bytes per source pixel
+#define DSTSIZE ((int)sizeof(dstT)) // bytes per destination pixel
+#else // CN == 3: go through vload3 / vstore3 on the scalar element type
+#define loadpix(addr)  vload3(0, (__global const srcT1 *)(addr))
+#define storepix(val, addr) vstore3(val, 0, (__global dstT1 *)(addr))
+#define SRCSIZE ((int)sizeof(srcT1) * 3) // parenthesized so the product stays intact next to any operator
+#define DSTSIZE ((int)sizeof(dstT1) * 3) // three elements per pixel
+#endif
 
 #define DIG(a) a,
 __constant float mat_kernel[] = { COEFF };
 
-__kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_C1_D0
-    (__global uchar * restrict src,
-     int src_step_in_pixel,
-     int src_offset_x, int src_offset_y,
-     int src_cols, int src_rows,
-     int src_whole_cols, int src_whole_rows,
-     __global float * dst,
-     int dst_step_in_pixel,
-     int dst_cols, int dst_rows,
-     int radiusy)
+__kernel void row_filter_C1_D0(__global const uchar * src, int src_step_in_pixel, int src_offset_x, int src_offset_y,
+                               int src_cols, int src_rows, int src_whole_cols, int src_whole_rows,
+                               __global float * dst, int dst_step_in_pixel, int dst_cols, int dst_rows,
+                               int radiusy)
 {
     int x = get_global_id(0)<<2;
     int y = get_global_id(1);
     int l_x = get_local_id(0);
     int l_y = get_local_id(1);
 
-    int start_x = x+src_offset_x - RADIUSX & 0xfffffffc;
+    int start_x = x + src_offset_x - RADIUSX & 0xfffffffc;
     int offset = src_offset_x - RADIUSX & 3;
     int start_y = y + src_offset_y - radiusy;
     int start_addr = mad24(start_y, src_step_in_pixel, start_x);
-    int i;
+
     float4 sum;
     uchar4 temp[READ_TIMES_ROW];
 
-    __local uchar4 LDS_DAT[LSIZE1][READ_TIMES_ROW*LSIZE0+1];
+    __local uchar4 LDS_DAT[LSIZE1][READ_TIMES_ROW * LSIZE0 + 1];
 #ifdef BORDER_CONSTANT
     int end_addr = mad24(src_whole_rows - 1, src_step_in_pixel, src_whole_cols);
 
     // read pixels from src
-    for (i = 0; i < READ_TIMES_ROW; i++)
+    for (int i = 0; i < READ_TIMES_ROW; ++i)
     {
-        int current_addr = start_addr+i*LSIZE0*4;
-        current_addr = ((current_addr < end_addr) && (current_addr > 0)) ? current_addr : 0;
-        temp[i] = *(__global uchar4*)&src[current_addr];
+        int current_addr = mad24(i, LSIZE0 << 2, start_addr);
+        current_addr = current_addr < end_addr && current_addr > 0 ? current_addr : 0;
+        temp[i] = *(__global const uchar4 *)&src[current_addr];
     }
 
     // judge if read out of boundary
 #ifdef BORDER_ISOLATED
-    for (i = 0; i<READ_TIMES_ROW; i++)
+    for (int i = 0; i < READ_TIMES_ROW; ++i)
     {
         temp[i].x = ELEM(start_x+i*LSIZE0*4,   src_offset_x, src_offset_x + src_cols, 0,         temp[i].x);
         temp[i].y = ELEM(start_x+i*LSIZE0*4+1, src_offset_x, src_offset_x + src_cols, 0,         temp[i].y);
@@ -194,7 +181,7 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_
         temp[i]   = ELEM(start_y,              src_offset_y, src_offset_y + src_rows, (uchar4)0, temp[i]);
     }
 #else
-    for (i = 0; i<READ_TIMES_ROW; i++)
+    for (int i = 0; i < READ_TIMES_ROW; ++i)
     {
         temp[i].x = ELEM(start_x+i*LSIZE0*4,   0, src_whole_cols, 0,         temp[i].x);
         temp[i].y = ELEM(start_x+i*LSIZE0*4+1, 0, src_whole_cols, 0,         temp[i].y);
@@ -209,16 +196,15 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_
 #else
     int not_all_in_range = (start_x<0) | (start_x + READ_TIMES_ROW*LSIZE0*4+4>src_whole_cols)| (start_y<0) | (start_y >= src_whole_rows);
 #endif
-    int4 index[READ_TIMES_ROW];
-    int4 addr;
+    int4 index[READ_TIMES_ROW], addr;
     int s_y;
 
     if (not_all_in_range)
     {
         // judge if read out of boundary
-        for (i = 0; i < READ_TIMES_ROW; i++)
+        for (int i = 0; i < READ_TIMES_ROW; ++i)
         {
-            index[i] = (int4)(start_x+i*LSIZE0*4) + (int4)(0, 1, 2, 3);
+            index[i] = (int4)(mad24(i, LSIZE0 << 2, start_x)) + (int4)(0, 1, 2, 3);
 #ifdef BORDER_ISOLATED
             EXTRAPOLATE(index[i].x, src_offset_x, src_offset_x + src_cols);
             EXTRAPOLATE(index[i].y, src_offset_x, src_offset_x + src_cols);
@@ -231,6 +217,7 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_
             EXTRAPOLATE(index[i].w, 0, src_whole_cols);
 #endif
         }
+
         s_y = start_y;
 #ifdef BORDER_ISOLATED
         EXTRAPOLATE(s_y, src_offset_y, src_offset_y + src_rows);
@@ -239,9 +226,9 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_
 #endif
 
         // read pixels from src
-        for (i = 0; i<READ_TIMES_ROW; i++)
+        for (int i = 0; i < READ_TIMES_ROW; ++i)
         {
-            addr = mad24((int4)s_y,(int4)src_step_in_pixel,index[i]);
+            addr = mad24((int4)s_y, (int4)src_step_in_pixel, index[i]);
             temp[i].x = src[addr.x];
             temp[i].y = src[addr.y];
             temp[i].z = src[addr.z];
@@ -251,26 +238,26 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_
     else
     {
         // read pixels from src
-        for (i = 0; i<READ_TIMES_ROW; i++)
-            temp[i] = *(__global uchar4*)&src[start_addr+i*LSIZE0*4];
+        for (int i = 0; i < READ_TIMES_ROW; ++i) // in-range path: one uchar4 load per step
+            temp[i] = *(__global const uchar4 *)&src[mad24(i, LSIZE0 << 2, start_addr)]; // keep src's const qualifier in the cast
     }
 #endif //BORDER_CONSTANT
 
     // save pixels to lds
-    for (i = 0; i<READ_TIMES_ROW; i++)
-        LDS_DAT[l_y][l_x+i*LSIZE0]=temp[i];
+    for (int i = 0; i < READ_TIMES_ROW; ++i)
+        LDS_DAT[l_y][mad24(i, LSIZE0, l_x)] = temp[i];
     barrier(CLK_LOCAL_MEM_FENCE);
 
     // read pixels from lds and calculate the result
-    sum =convert_float4(vload4(0,(__local uchar*)&LDS_DAT[l_y][l_x]+RADIUSX+offset))*mat_kernel[RADIUSX];
-    for (i=1; i<=RADIUSX; i++)
+    sum = convert_float4(vload4(0,(__local uchar *)&LDS_DAT[l_y][l_x]+RADIUSX+offset)) * mat_kernel[RADIUSX];
+    for (int i = 1; i <= RADIUSX; ++i)
     {
         temp[0] = vload4(0, (__local uchar*)&LDS_DAT[l_y][l_x] + RADIUSX + offset - i);
         temp[1] = vload4(0, (__local uchar*)&LDS_DAT[l_y][l_x] + RADIUSX + offset + i);
-        sum += convert_float4(temp[0]) * mat_kernel[RADIUSX-i] + convert_float4(temp[1]) * mat_kernel[RADIUSX+i];
+        sum += mad(convert_float4(temp[0]), mat_kernel[RADIUSX-i], convert_float4(temp[1]) * mat_kernel[RADIUSX + i]);
     }
 
-    start_addr = mad24(y,dst_step_in_pixel,x);
+    start_addr = mad24(y, dst_step_in_pixel, x);
 
     // write the result to dst
     if ((x+3<dst_cols) & (y<dst_rows))
@@ -290,63 +277,58 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_
         dst[start_addr] = sum.x;
 }
 
-__kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_C4_D0
-    (__global uchar4 * restrict src,
-     int src_step_in_pixel,
-     int src_offset_x, int src_offset_y,
-     int src_cols, int src_rows,
-     int src_whole_cols, int src_whole_rows,
-     __global float4 * dst,
-     int dst_step_in_pixel,
-     int dst_cols, int dst_rows,
-     int radiusy)
+__kernel void row_filter(__global const uchar * src, int src_step, int src_offset_x, int src_offset_y,
+                         int src_cols, int src_rows, int src_whole_cols, int src_whole_rows,
+                         __global uchar * dst, int dst_step, int dst_cols, int dst_rows,
+                         int radiusy)
 {
     int x = get_global_id(0);
     int y = get_global_id(1);
     int l_x = get_local_id(0);
     int l_y = get_local_id(1);
-    int start_x = x+src_offset_x-RADIUSX;
-    int start_y = y+src_offset_y-radiusy;
-    int start_addr = mad24(start_y,src_step_in_pixel,start_x);
-    int i;
-    float4 sum;
-    uchar4 temp[READ_TIMES_ROW];
 
-    __local uchar4 LDS_DAT[LSIZE1][READ_TIMES_ROW*LSIZE0+1];
+    int start_x = x + src_offset_x - RADIUSX;
+    int start_y = y + src_offset_y - radiusy;
+    int start_addr = mad24(start_y, src_step, start_x * SRCSIZE);
+
+    dstT sum;
+    srcT temp[READ_TIMES_ROW];
+
+    __local srcT LDS_DAT[LSIZE1][READ_TIMES_ROW * LSIZE0 + 1];
 #ifdef BORDER_CONSTANT
-    int end_addr = mad24(src_whole_rows - 1,src_step_in_pixel,src_whole_cols);
+    int end_addr = mad24(src_whole_rows - 1, src_step, src_whole_cols * SRCSIZE);
 
     // read pixels from src
-    for (i = 0; i<READ_TIMES_ROW; i++)
+    for (int i = 0; i < READ_TIMES_ROW; i++)
     {
-        int current_addr = start_addr+i*LSIZE0;
-        current_addr = ((current_addr < end_addr) && (current_addr > 0)) ? current_addr : 0;
-        temp[i] = src[current_addr];
+        int current_addr = mad24(i, LSIZE0 * SRCSIZE, start_addr);
+        current_addr = current_addr < end_addr && current_addr >= 0 ? current_addr : 0;
+        temp[i] = loadpix(src + current_addr);
     }
 
-    //judge if read out of boundary
+    // judge if read out of boundary
 #ifdef BORDER_ISOLATED
-    for (i = 0; i<READ_TIMES_ROW; i++)
+    for (int i = 0; i < READ_TIMES_ROW; ++i)
     {
-        temp[i]= ELEM(start_x+i*LSIZE0, src_offset_x, src_offset_x + src_cols, (uchar4)0, temp[i]);
-        temp[i]= ELEM(start_y,          src_offset_y, src_offset_y + src_rows, (uchar4)0, temp[i]);
+        temp[i] = ELEM(mad24(i, LSIZE0, start_x), src_offset_x, src_offset_x + src_cols, (srcT)(0), temp[i]);
+        temp[i] = ELEM(start_y,                   src_offset_y, src_offset_y + src_rows, (srcT)(0), temp[i]);
     }
 #else
-    for (i = 0; i<READ_TIMES_ROW; i++)
+    for (int i = 0; i < READ_TIMES_ROW; ++i)
     {
-        temp[i]= ELEM(start_x+i*LSIZE0, 0, src_whole_cols, (uchar4)0, temp[i]);
-        temp[i]= ELEM(start_y,          0, src_whole_rows, (uchar4)0, temp[i]);
+        temp[i] = ELEM(mad24(i, LSIZE0, start_x), 0, src_whole_cols, (srcT)(0), temp[i]);
+        temp[i] = ELEM(start_y,                   0, src_whole_rows, (srcT)(0), temp[i]);
     }
 #endif
 #else
-    int index[READ_TIMES_ROW];
-    int s_x,s_y;
+    int index[READ_TIMES_ROW], s_x, s_y;
 
     // judge if read out of boundary
-    for (i = 0; i<READ_TIMES_ROW; i++)
+    for (int i = 0; i < READ_TIMES_ROW; ++i)
     {
-        s_x = start_x+i*LSIZE0;
+        s_x = mad24(i, LSIZE0, start_x);
         s_y = start_y;
+
 #ifdef BORDER_ISOLATED
         EXTRAPOLATE(s_x, src_offset_x, src_offset_x + src_cols);
         EXTRAPOLATE(s_y, src_offset_y, src_offset_y + src_rows);
@@ -354,216 +336,32 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_
         EXTRAPOLATE(s_x, 0, src_whole_cols);
         EXTRAPOLATE(s_y, 0, src_whole_rows);
 #endif
-        index[i]=mad24(s_y, src_step_in_pixel, s_x);
+        index[i] = mad24(s_y, src_step, s_x * SRCSIZE);
     }
 
-    //read pixels from src
-    for (i = 0; i<READ_TIMES_ROW; i++)
-        temp[i] = src[index[i]];
-#endif //BORDER_CONSTANT
-
-    //save pixels to lds
-    for (i = 0; i<READ_TIMES_ROW; i++)
-        LDS_DAT[l_y][l_x+i*LSIZE0]=temp[i];
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    //read pixels from lds and calculate the result
-    sum =convert_float4(LDS_DAT[l_y][l_x+RADIUSX])*mat_kernel[RADIUSX];
-    for (i=1; i<=RADIUSX; i++)
-    {
-        temp[0]=LDS_DAT[l_y][l_x+RADIUSX-i];
-        temp[1]=LDS_DAT[l_y][l_x+RADIUSX+i];
-        sum += convert_float4(temp[0])*mat_kernel[RADIUSX-i]+convert_float4(temp[1])*mat_kernel[RADIUSX+i];
-    }
-    //write the result to dst
-    if (x<dst_cols && y<dst_rows)
-    {
-        start_addr = mad24(y,dst_step_in_pixel,x);
-        dst[start_addr] = sum;
-    }
-}
-
-__kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_C1_D5
-    (__global float * restrict src,
-     int src_step_in_pixel,
-     int src_offset_x, int src_offset_y,
-     int src_cols, int src_rows,
-     int src_whole_cols, int src_whole_rows,
-     __global float * dst,
-     int dst_step_in_pixel,
-     int dst_cols, int dst_rows,
-     int radiusy)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-    int l_x = get_local_id(0);
-    int l_y = get_local_id(1);
-    int start_x = x+src_offset_x-RADIUSX;
-    int start_y = y+src_offset_y-radiusy;
-    int start_addr = mad24(start_y,src_step_in_pixel,start_x);
-    int i;
-    float sum;
-    float temp[READ_TIMES_ROW];
-
-    __local float LDS_DAT[LSIZE1][READ_TIMES_ROW*LSIZE0+1];
-#ifdef BORDER_CONSTANT
-    int end_addr = mad24(src_whole_rows - 1,src_step_in_pixel,src_whole_cols);
-
     // read pixels from src
-    for (i = 0; i<READ_TIMES_ROW; i++)
-    {
-        int current_addr = start_addr+i*LSIZE0;
-        current_addr = ((current_addr < end_addr) && (current_addr > 0)) ? current_addr : 0;
-        temp[i] = src[current_addr];
-    }
-
-    // judge if read out of boundary
-#ifdef BORDER_ISOLATED
-    for (i = 0; i<READ_TIMES_ROW; i++)
-    {
-        temp[i]= ELEM(start_x+i*LSIZE0, src_offset_x, src_offset_x + src_cols, (float)0,temp[i]);
-        temp[i]= ELEM(start_y,          src_offset_y, src_offset_y + src_rows, (float)0,temp[i]);
-    }
-#else
-    for (i = 0; i<READ_TIMES_ROW; i++)
-    {
-        temp[i]= ELEM(start_x+i*LSIZE0, 0, src_whole_cols, (float)0,temp[i]);
-        temp[i]= ELEM(start_y,          0, src_whole_rows, (float)0,temp[i]);
-    }
-#endif
-#else // BORDER_CONSTANT
-    int index[READ_TIMES_ROW];
-    int s_x,s_y;
-    // judge if read out of boundary
-    for (i = 0; i<READ_TIMES_ROW; i++)
-    {
-        s_x = start_x + i*LSIZE0, s_y = start_y;
-#ifdef BORDER_ISOLATED
-        EXTRAPOLATE(s_x, src_offset_x, src_offset_x + src_cols);
-        EXTRAPOLATE(s_y, src_offset_y, src_offset_y + src_rows);
-#else
-        EXTRAPOLATE(s_x, 0, src_whole_cols);
-        EXTRAPOLATE(s_y, 0, src_whole_rows);
-#endif
-
-        index[i]=mad24(s_y, src_step_in_pixel, s_x);
-    }
-    // read pixels from src
-    for (i = 0; i<READ_TIMES_ROW; i++)
-        temp[i] = src[index[i]];
-#endif// BORDER_CONSTANT
-
-    //save pixels to lds
-    for (i = 0; i<READ_TIMES_ROW; i++)
-        LDS_DAT[l_y][l_x+i*LSIZE0]=temp[i];
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    // read pixels from lds and calculate the result
-    sum =LDS_DAT[l_y][l_x+RADIUSX]*mat_kernel[RADIUSX];
-    for (i=1; i<=RADIUSX; i++)
-    {
-        temp[0]=LDS_DAT[l_y][l_x+RADIUSX-i];
-        temp[1]=LDS_DAT[l_y][l_x+RADIUSX+i];
-        sum += temp[0]*mat_kernel[RADIUSX-i]+temp[1]*mat_kernel[RADIUSX+i];
-    }
-
-    // write the result to dst
-    if (x<dst_cols && y<dst_rows)
-    {
-        start_addr = mad24(y,dst_step_in_pixel,x);
-        dst[start_addr] = sum;
-    }
-}
-
-__kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_C4_D5
-    (__global float4 * restrict src,
-     int src_step_in_pixel,
-     int src_offset_x, int src_offset_y,
-     int src_cols, int src_rows,
-     int src_whole_cols, int src_whole_rows,
-     __global float4 * dst,
-     int dst_step_in_pixel,
-     int dst_cols, int dst_rows,
-     int radiusy)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-    int l_x = get_local_id(0);
-    int l_y = get_local_id(1);
-    int start_x = x+src_offset_x-RADIUSX;
-    int start_y = y+src_offset_y-radiusy;
-    int start_addr = mad24(start_y,src_step_in_pixel,start_x);
-    int i;
-    float4 sum;
-    float4 temp[READ_TIMES_ROW];
-
-    __local float4 LDS_DAT[LSIZE1][READ_TIMES_ROW*LSIZE0+1];
-#ifdef BORDER_CONSTANT
-    int end_addr = mad24(src_whole_rows - 1,src_step_in_pixel,src_whole_cols);
-
-    // read pixels from src
-    for (i = 0; i<READ_TIMES_ROW; i++)
-    {
-        int current_addr = start_addr+i*LSIZE0;
-        current_addr = ((current_addr < end_addr) && (current_addr > 0)) ? current_addr : 0;
-        temp[i] = src[current_addr];
-    }
-
-    // judge if read out of boundary
-#ifdef BORDER_ISOLATED
-    for (i = 0; i<READ_TIMES_ROW; i++)
-    {
-        temp[i]= ELEM(start_x+i*LSIZE0, src_offset_x, src_offset_x + src_cols, (float4)0,temp[i]);
-        temp[i]= ELEM(start_y,          src_offset_y, src_offset_y + src_rows, (float4)0,temp[i]);
-    }
-#else
-    for (i = 0; i<READ_TIMES_ROW; i++)
-    {
-        temp[i]= ELEM(start_x+i*LSIZE0, 0, src_whole_cols, (float4)0,temp[i]);
-        temp[i]= ELEM(start_y,          0, src_whole_rows, (float4)0,temp[i]);
-    }
-#endif
-#else
-    int index[READ_TIMES_ROW];
-    int s_x,s_y;
-
-    // judge if read out of boundary
-    for (i = 0; i<READ_TIMES_ROW; i++)
-    {
-        s_x = start_x + i*LSIZE0, s_y = start_y;
-#ifdef BORDER_ISOLATED
-        EXTRAPOLATE(s_x, src_offset_x, src_offset_x + src_cols);
-        EXTRAPOLATE(s_y, src_offset_y, src_offset_y + src_rows);
-#else
-        EXTRAPOLATE(s_x, 0, src_whole_cols);
-        EXTRAPOLATE(s_y, 0, src_whole_rows);
-#endif
-
-        index[i]=mad24(s_y,src_step_in_pixel,s_x);
-    }
-    // read pixels from src
-    for (i = 0; i<READ_TIMES_ROW; i++)
-        temp[i] = src[index[i]];
-#endif
+    for (int i = 0; i < READ_TIMES_ROW; ++i)
+        temp[i] = loadpix(src + index[i]);
+#endif // BORDER_CONSTANT
 
     // save pixels to lds
-    for (i = 0; i<READ_TIMES_ROW; i++)
-        LDS_DAT[l_y][l_x+i*LSIZE0]=temp[i];
+    for (int i = 0; i < READ_TIMES_ROW; ++i)
+        LDS_DAT[l_y][mad24(i, LSIZE0, l_x)] = temp[i];
     barrier(CLK_LOCAL_MEM_FENCE);
 
     // read pixels from lds and calculate the result
-    sum =LDS_DAT[l_y][l_x+RADIUSX]*mat_kernel[RADIUSX];
-    for (i=1; i<=RADIUSX; i++)
+    sum = convertToDstT(LDS_DAT[l_y][l_x + RADIUSX]) * mat_kernel[RADIUSX];
+    for (int i = 1; i <= RADIUSX; ++i)
     {
-        temp[0]=LDS_DAT[l_y][l_x+RADIUSX-i];
-        temp[1]=LDS_DAT[l_y][l_x+RADIUSX+i];
-        sum += temp[0]*mat_kernel[RADIUSX-i]+temp[1]*mat_kernel[RADIUSX+i];
+        temp[0] = LDS_DAT[l_y][l_x + RADIUSX - i];
+        temp[1] = LDS_DAT[l_y][l_x + RADIUSX + i];
+        sum += mad(convertToDstT(temp[0]), mat_kernel[RADIUSX - i], convertToDstT(temp[1]) * mat_kernel[RADIUSX + i]);
     }
 
     // write the result to dst
-    if (x<dst_cols && y<dst_rows)
+    if (x < dst_cols && y < dst_rows)
     {
-        start_addr = mad24(y,dst_step_in_pixel,x);
-        dst[start_addr] = sum;
+        start_addr = mad24(y, dst_step, x * DSTSIZE);
+        storepix(sum, dst + start_addr);
     }
 }
diff --git a/modules/imgproc/src/opencl/filterSep_singlePass.cl b/modules/imgproc/src/opencl/filterSep_singlePass.cl
new file mode 100644
index 000000000..e75574035
--- /dev/null
+++ b/modules/imgproc/src/opencl/filterSep_singlePass.cl
@@ -0,0 +1,187 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2014, Intel Corporation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+/////////////////////////////////Macro for border type////////////////////////////////////////////
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#ifdef BORDER_CONSTANT
+// CCCCCC|abcdefgh|CCCCCCC
+#define EXTRAPOLATE(x, maxV)
+#elif defined BORDER_REPLICATE
+// aaaaaa|abcdefgh|hhhhhhh
+#define EXTRAPOLATE(x, maxV) \
+    { \
+        (x) = max(min((x), (maxV) - 1), 0); \
+    }
+#elif defined BORDER_WRAP
+// cdefgh|abcdefgh|abcdefg
+#define EXTRAPOLATE(x, maxV) \
+    { \
+        (x) = ( (x) + (maxV) ) % (maxV); \
+    }
+#elif defined BORDER_REFLECT
+// fedcba|abcdefgh|hgfedcb
+#define EXTRAPOLATE(x, maxV) \
+    { \
+        (x) = min(((maxV)-1)*2-(x)+1, max((x),-(x)-1) ); \
+    }
+#elif defined BORDER_REFLECT_101 || defined BORDER_REFLECT101
+// gfedcb|abcdefgh|gfedcba
+#define EXTRAPOLATE(x, maxV) \
+    { \
+        (x) = min(((maxV)-1)*2-(x), max((x),-(x)) ); \
+    }
+#else
+#error No extrapolation method
+#endif
+
+#if CN != 3
+#define loadpix(addr) *(__global const srcT *)(addr)
+#define storepix(val, addr)  *(__global dstT *)(addr) = val
+#define SRCSIZE (int)sizeof(srcT)
+#define DSTSIZE (int)sizeof(dstT)
+#else
+#define loadpix(addr)  vload3(0, (__global const srcT1 *)(addr))
+#define storepix(val, addr) vstore3(val, 0, (__global dstT1 *)(addr))
+#define SRCSIZE (int)sizeof(srcT1)*3
+#define DSTSIZE (int)sizeof(dstT1)*3
+#endif
+
+#define SRC(_x,_y) convertToWT(loadpix(Src + mad24(_y, src_step, SRCSIZE * _x)))
+
+#ifdef BORDER_CONSTANT
+// CCCCCC|abcdefgh|CCCCCCC
+#define ELEM(_x,_y,r_edge,t_edge,const_v) (_x)<0 | (_x) >= (r_edge) | (_y)<0 | (_y) >= (t_edge) ? (const_v) : SRC((_x),(_y))
+#else
+#define ELEM(_x,_y,r_edge,t_edge,const_v) SRC((_x),(_y))
+#endif
+
+#define noconvert
+
+// horizontal and vertical filter kernels
+// should be defined on host during compile time to avoid overhead
+#define DIG(a) a,
+__constant float mat_kernelX[] = { KERNEL_MATRIX_X };
+__constant float mat_kernelY[] = { KERNEL_MATRIX_Y };
+
+__kernel void sep_filter(__global uchar* Src, int src_step, int srcOffsetX, int srcOffsetY, int height, int width,
+                         __global uchar* Dst, int dst_step, int dst_offset, int dst_rows, int dst_cols, float delta)
+{
+    // RADIUSX, RADIUSY are the filter radii (each pass uses 2 * RADIUS + 1 taps)
+    // BLK_X, BLK_Y are the local workgroup sizes
+    // all of these must be defined by the host at compile time
+    // first lsmem array for source pixels used in first pass,
+    // second lsmemDy for storing first pass results
+    __local WT lsmem[BLK_Y + 2 * RADIUSY][BLK_X + 2 * RADIUSX];
+    __local WT lsmemDy[BLK_Y][BLK_X + 2 * RADIUSX];
+
+    // get local and global ids - used as image and local memory array indexes
+    int lix = get_local_id(0);
+    int liy = get_local_id(1);
+
+    int x = get_global_id(0);
+    int y = get_global_id(1);
+
+    // calculate pixel position in source image taking image offset into account
+    int srcX = x + srcOffsetX - RADIUSX;
+    int srcY = y + srcOffsetY - RADIUSY;
+    int xb = srcX;
+    int yb = srcY;
+
+    // extrapolate coordinates, if needed,
+    // and read this work-item's source pixel into local memory;
+    // the extra border pixels are loaded by further loop iterations of the work-items with the lowest local ids
+    int clocY = liy;
+    int cSrcY = srcY;
+    do
+    {
+        int yb = cSrcY;
+        EXTRAPOLATE(yb, (height));
+
+        int clocX = lix;
+        int cSrcX = srcX;
+        do
+        {
+            int xb = cSrcX;
+            EXTRAPOLATE(xb,(width));
+            lsmem[clocY][clocX] = ELEM(xb, yb, (width), (height), 0 );
+
+            clocX += BLK_X;
+            cSrcX += BLK_X;
+        }
+        while(clocX < BLK_X+(RADIUSX*2));
+
+        clocY += BLK_Y;
+        cSrcY += BLK_Y;
+    }
+    while (clocY < BLK_Y+(RADIUSY*2));
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // do vertical filter pass
+    // and store intermediate results to second local memory array
+    int i, clocX = lix;
+    WT sum = 0.0f;
+    do
+    {
+        sum = 0.0f;
+        for (i=0; i<=2*RADIUSY; i++)
+            sum = mad(lsmem[liy+i][clocX], mat_kernelY[i], sum);
+        lsmemDy[liy][clocX] = sum;
+        clocX += BLK_X;
+    }
+    while(clocX < BLK_X+(RADIUSX*2));
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // if this pixel happened to be out of image borders because of global size rounding,
+    // then just return
+    if( x >= dst_cols || y >=dst_rows )
+        return;
+
+    // do second horizontal filter pass
+    // and calculate final result
+    sum = 0.0f;
+    for (i=0; i<=2*RADIUSX; i++)
+        sum = mad(lsmemDy[liy][lix+i], mat_kernelX[i], sum);
+
+    // store result into destination image
+    storepix(convertToDstT(sum + (WT)(delta)), Dst + mad24(y, dst_step, mad24(x, DSTSIZE, dst_offset)));
+}
diff --git a/modules/imgproc/src/opencl/laplacian5.cl b/modules/imgproc/src/opencl/laplacian5.cl
new file mode 100644
index 000000000..3e15e097c
--- /dev/null
+++ b/modules/imgproc/src/opencl/laplacian5.cl
@@ -0,0 +1,34 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+// Copyright (C) 2014, Itseez, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+
+#define noconvert
+
+__kernel void sumConvert(__global const uchar * src1ptr, int src1_step, int src1_offset,
+                         __global const uchar * src2ptr, int src2_step, int src2_offset,
+                         __global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,
+                         coeffT scale, coeffT delta)
+{
+    int x = get_global_id(0);
+    int y = get_global_id(1);
+
+    if (y < dst_rows && x < dst_cols)
+    {
+        int src1_index = mad24(y, src1_step, mad24(x, (int)sizeof(srcT), src1_offset));
+        int src2_index = mad24(y, src2_step, mad24(x, (int)sizeof(srcT), src2_offset));
+        int dst_index = mad24(y, dst_step, mad24(x, (int)sizeof(dstT), dst_offset));
+
+        __global const srcT * src1 = (__global const srcT *)(src1ptr + src1_index);
+        __global const srcT * src2 = (__global const srcT *)(src2ptr + src2_index);
+        __global dstT * dst = (__global dstT *)(dstptr + dst_index);
+
+#if wdepth <= 4
+        dst[0] = convertToDT( mad24((WT)(scale), convertToWT(src1[0]) + convertToWT(src2[0]), (WT)(delta)) );
+#else
+        dst[0] = convertToDT( mad((WT)(scale), convertToWT(src1[0]) + convertToWT(src2[0]), (WT)(delta)) );
+#endif
+    }
+}
diff --git a/modules/imgproc/src/opencl/medianFilter.cl b/modules/imgproc/src/opencl/medianFilter.cl
index 6aed88ef7..c1ab04545 100644
--- a/modules/imgproc/src/opencl/medianFilter.cl
+++ b/modules/imgproc/src/opencl/medianFilter.cl
@@ -29,52 +29,52 @@
 // or tort (including negligence or otherwise) arising in any way out of
 // the use of this software, even if advised of the possibility of such damage.
 
-#define DATA_TYPE type
+#if cn != 3
+#define loadpix(addr) *(__global const T *)(addr)
+#define storepix(val, addr)  *(__global T *)(addr) = val
+#define TSIZE (int)sizeof(T)
+#else
+#define loadpix(addr) vload3(0, (__global const T1 *)(addr))
+#define storepix(val, addr) vstore3(val, 0, (__global T1 *)(addr))
+#define TSIZE (int)sizeof(T1) * cn
+#endif
 
-#define scnbytes ((int)sizeof(type))
+#define op(a, b) { mid = a; a = min(a, b); b = max(mid, b); }
 
-#define op(a,b) {    mid=a; a=min(a,b); b=max(mid,b);}
-
-__kernel void medianFilter3(__global const uchar* srcptr, int srcStep, int srcOffset,
-                            __global uchar* dstptr, int dstStep, int dstOffset,
-                            int rows, int cols)
+__kernel void medianFilter3(__global const uchar * srcptr, int src_step, int src_offset,
+                            __global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols)
 {
-    __local DATA_TYPE data[18][18];
+    __local T data[18][18];
 
     int x = get_local_id(0);
     int y = get_local_id(1);
 
-    int gx= get_global_id(0);
-    int gy= get_global_id(1);
+    int gx = get_global_id(0);
+    int gy = get_global_id(1);
 
     int dx = gx - x - 1;
     int dy = gy - y - 1;
 
-    const int id = min((int)(x*16+y), 9*18-1);
+    int id = min(mad24(x, 16, y), 9*18-1);
 
     int dr = id / 18;
     int dc = id % 18;
 
-    int c = clamp(dx+dc, 0, cols-1);
+    int c = clamp(dx + dc, 0, dst_cols - 1);
 
-    int r = clamp(dy+dr, 0, rows-1);
-    int index1 = mad24(r, srcStep, srcOffset + c*scnbytes);
-
-    r = clamp(dy+dr+9, 0, rows-1);
-    int index9 = mad24(r, srcStep, srcOffset + c*scnbytes);
-
-    __global DATA_TYPE * src = (__global DATA_TYPE *)(srcptr + index1);
-    data[dr][dc] = src[0];
-
-    src = (__global DATA_TYPE *)(srcptr + index9);
-    data[dr+9][dc] = src[0];
+    int r = clamp(dy + dr, 0, dst_rows - 1);
+    int index1 = mad24(r, src_step, mad24(c, TSIZE, src_offset));
+    r = clamp(dy + dr + 9, 0, dst_rows - 1);
+    int index9 = mad24(r, src_step, mad24(c, TSIZE, src_offset));
 
+    data[dr][dc] = loadpix(srcptr + index1);
+    data[dr+9][dc] = loadpix(srcptr + index9);
     barrier(CLK_LOCAL_MEM_FENCE);
 
-    DATA_TYPE p0=data[y][x], p1=data[y][(x+1)], p2=data[y][(x+2)];
-    DATA_TYPE p3=data[y+1][x], p4=data[y+1][(x+1)], p5=data[y+1][(x+2)];
-    DATA_TYPE p6=data[y+2][x], p7=data[y+2][(x+1)], p8=data[y+2][(x+2)];
-    DATA_TYPE mid;
+    T p0 = data[y][x], p1 = data[y][(x+1)], p2 = data[y][(x+2)];
+    T p3 = data[y+1][x], p4 = data[y+1][(x+1)], p5 = data[y+1][(x+2)];
+    T p6 = data[y+2][x], p7 = data[y+2][(x+1)], p8 = data[y+2][(x+2)];
+    T mid;
 
     op(p1, p2); op(p4, p5); op(p7, p8); op(p0, p1);
     op(p3, p4); op(p6, p7); op(p1, p2); op(p4, p5);
@@ -82,56 +82,48 @@ __kernel void medianFilter3(__global const uchar* srcptr, int srcStep, int srcOf
     op(p3, p6); op(p1, p4); op(p2, p5); op(p4, p7);
     op(p4, p2); op(p6, p4); op(p4, p2);
 
-    int dst_index = mad24( gy, dstStep, dstOffset + gx * scnbytes);
+    int dst_index = mad24( gy, dst_step, mad24(gx, TSIZE, dst_offset));
 
-    if( gy < rows && gx < cols)
-    {
-        __global DATA_TYPE* dst = (__global DATA_TYPE *)(dstptr + dst_index);
-        dst[0] = p4;
-    }
+    if (gy < dst_rows && gx < dst_cols)
+        storepix(p4, dstptr + dst_index);
 }
 
-__kernel void medianFilter5(__global const uchar* srcptr, int srcStep, int srcOffset,
-                            __global uchar* dstptr, int dstStep, int dstOffset,
-                            int rows, int cols)
+__kernel void medianFilter5(__global const uchar * srcptr, int src_step, int src_offset,
+                            __global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols)
 {
-    __local DATA_TYPE data[20][20];
+    __local T data[20][20];
 
-    int x =get_local_id(0);
-    int y =get_local_id(1);
+    int x = get_local_id(0);
+    int y = get_local_id(1);
 
-    int gx=get_global_id(0);
-    int gy=get_global_id(1);
+    int gx = get_global_id(0);
+    int gy = get_global_id(1);
 
     int dx = gx - x - 2;
     int dy = gy - y - 2;
 
-    const int id = min((int)(x*16+y), 10*20-1);
+    int id = min(mad24(x, 16, y), 10*20-1);
 
-    int dr=id/20;
-    int dc=id%20;
+    int dr = id / 20;
+    int dc = id % 20;
 
-    int c=clamp(dx+dc, 0, cols-1);
+    int c = clamp(dx + dc, 0, dst_cols - 1);
+    int r = clamp(dy + dr, 0, dst_rows - 1);
+    int index1 = mad24(r, src_step, mad24(c, TSIZE, src_offset));
 
-    int r = clamp(dy+dr, 0, rows-1);
-    int index1 = mad24(r, srcStep, srcOffset + c*scnbytes);
-
-    r = clamp(dy+dr+10, 0, rows-1);
-    int index10 = mad24(r, srcStep, srcOffset + c*scnbytes);
-
-    __global DATA_TYPE * src = (__global DATA_TYPE *)(srcptr + index1);
-    data[dr][dc] = src[0];
-    src = (__global DATA_TYPE *)(srcptr + index10);
-    data[dr+10][dc] = src[0];
+    r = clamp(dy + dr + 10, 0, dst_rows - 1);
+    int index10 = mad24(r, src_step, mad24(c, TSIZE, src_offset));
 
+    data[dr][dc] = loadpix(srcptr + index1);
+    data[dr+10][dc] = loadpix(srcptr + index10);
     barrier(CLK_LOCAL_MEM_FENCE);
 
-    DATA_TYPE p0=data[y][x], p1=data[y][x+1], p2=data[y][x+2], p3=data[y][x+3], p4=data[y][x+4];
-    DATA_TYPE p5=data[y+1][x], p6=data[y+1][x+1], p7=data[y+1][x+2], p8=data[y+1][x+3], p9=data[y+1][x+4];
-    DATA_TYPE p10=data[y+2][x], p11=data[y+2][x+1], p12=data[y+2][x+2], p13=data[y+2][x+3], p14=data[y+2][x+4];
-    DATA_TYPE p15=data[y+3][x], p16=data[y+3][x+1], p17=data[y+3][x+2], p18=data[y+3][x+3], p19=data[y+3][x+4];
-    DATA_TYPE p20=data[y+4][x], p21=data[y+4][x+1], p22=data[y+4][x+2], p23=data[y+4][x+3], p24=data[y+4][x+4];
-    DATA_TYPE mid;
+    T p0 = data[y][x], p1 = data[y][x+1], p2 = data[y][x+2], p3 = data[y][x+3], p4 = data[y][x+4];
+    T p5 = data[y+1][x], p6 = data[y+1][x+1], p7 = data[y+1][x+2], p8 = data[y+1][x+3], p9 = data[y+1][x+4];
+    T p10 = data[y+2][x], p11 = data[y+2][x+1], p12 = data[y+2][x+2], p13 = data[y+2][x+3], p14 = data[y+2][x+4];
+    T p15 = data[y+3][x], p16 = data[y+3][x+1], p17 = data[y+3][x+2], p18 = data[y+3][x+3], p19 = data[y+3][x+4];
+    T p20 = data[y+4][x], p21 = data[y+4][x+1], p22 = data[y+4][x+2], p23 = data[y+4][x+3], p24 = data[y+4][x+4];
+    T mid;
 
     op(p1, p2); op(p0, p1); op(p1, p2); op(p4, p5); op(p3, p4);
     op(p4, p5); op(p0, p3); op(p2, p5); op(p2, p3); op(p1, p4);
@@ -157,11 +149,8 @@ __kernel void medianFilter5(__global const uchar* srcptr, int srcStep, int srcOf
     op(p13, p17); op(p3, p15); op(p11, p23); op(p11, p15); op(p7, p19);
     op(p7, p11); op(p11, p13); op(p11, p12);
 
-    int dst_index = mad24( gy, dstStep, dstOffset + gx * scnbytes);
+    int dst_index = mad24(gy, dst_step, mad24(gx, TSIZE, dst_offset));
 
-    if( gy < rows && gx < cols)
-    {
-        __global DATA_TYPE* dst = (__global DATA_TYPE *)(dstptr + dst_index);
-        dst[0] = p12;
-    }
+    if (gy < dst_rows && gx < dst_cols)
+        storepix(p12, dstptr + dst_index);
 }
\ No newline at end of file
diff --git a/modules/imgproc/src/opencl/morph.cl b/modules/imgproc/src/opencl/morph.cl
index cb6e733ed..fe11b4994 100644
--- a/modules/imgproc/src/opencl/morph.cl
+++ b/modules/imgproc/src/opencl/morph.cl
@@ -43,6 +43,16 @@
 #endif
 #endif
 
+#if cn != 3
+#define loadpix(addr) *(__global const T *)(addr)
+#define storepix(val, addr)  *(__global T *)(addr) = val
+#define TSIZE (int)sizeof(T)
+#else
+#define loadpix(addr) vload3(0, (__global const T1 *)(addr))
+#define storepix(val, addr) vstore3(val, 0, (__global T1 *)(addr))
+#define TSIZE ((int)sizeof(T1)*3)
+#endif
+
 #ifdef DEPTH_0
 #ifdef ERODE
 #define VAL 255
@@ -50,16 +60,14 @@
 #ifdef DILATE
 #define VAL 0
 #endif
-#endif
-#ifdef DEPTH_5
+#elif defined DEPTH_5
 #ifdef ERODE
 #define VAL FLT_MAX
 #endif
 #ifdef DILATE
 #define VAL -FLT_MAX
 #endif
-#endif
-#ifdef DEPTH_6
+#elif defined DEPTH_6
 #ifdef ERODE
 #define VAL DBL_MAX
 #endif
@@ -69,84 +77,80 @@
 #endif
 
 #ifdef ERODE
+#ifdef INTEL_DEVICE
+// Workaround for a bug in Intel HD Graphics drivers (10.18.10.3496 or older): use a ternary instead of the built-in min()
+#define MORPH_OP(A,B) ((A) < (B) ? (A) : (B))
+#else // all other devices use the built-in min()
 #define MORPH_OP(A,B) min((A),(B))
 #endif
+#endif // ERODE
 #ifdef DILATE
 #define MORPH_OP(A,B) max((A),(B))
 #endif
-//BORDER_CONSTANT:      iiiiii|abcdefgh|iiiiiii
-#define ELEM(i,l_edge,r_edge,elem1,elem2) (i)<(l_edge) | (i) >= (r_edge) ? (elem1) : (elem2)
 
-__kernel void morph(__global const uchar * restrict srcptr, int src_step, int src_offset,
+// BORDER_CONSTANT: iiiiii|abcdefgh|iiiiiii -- yields elem1 when i lies outside [l_edge, r_edge), elem2 otherwise
+#define ELEM(i, l_edge, r_edge, elem1, elem2) ((i) < (l_edge) | (i) >= (r_edge) ? (elem1) : (elem2))
+
+__kernel void morph(__global const uchar * srcptr, int src_step, int src_offset, // applies MORPH_OP over a (2*RADIUSX+1) x (2*RADIUSY+1) window staged in local memory
                     __global uchar * dstptr, int dst_step, int dst_offset,
-                    int src_offset_x, int src_offset_y,
-                    int cols, int rows,
-                    __constant uchar * mat_kernel,
-                    int src_whole_cols, int src_whole_rows)
+                    int src_offset_x, int src_offset_y, int cols, int rows,
+                    __constant uchar * mat_kernel, int src_whole_cols, int src_whole_rows)
 {
-    int l_x = get_local_id(0);
-    int l_y = get_local_id(1);
-    int x = get_group_id(0)*LSIZE0;
-    int y = get_group_id(1)*LSIZE1;
-    int start_x = x+src_offset_x-RADIUSX;
-    int end_x = x + src_offset_x+LSIZE0+RADIUSX;
-    int width = end_x -(x+src_offset_x-RADIUSX)+1;
-    int start_y = y+src_offset_y-RADIUSY;
-    int point1 = mad24(l_y,LSIZE0,l_x);
-    int point2 = point1 + LSIZE0*LSIZE1;
-    int tl_x = point1 % width;
-    int tl_y = point1 / width;
-    int tl_x2 = point2 % width;
-    int tl_y2 = point2 / width;
-    int cur_x = start_x + tl_x;
-    int cur_y = start_y + tl_y;
-    int cur_x2 = start_x + tl_x2;
-    int cur_y2 = start_y + tl_y2;
-    int start_addr = mad24(cur_y,src_step, cur_x*(int)sizeof(GENTYPE));
-    int start_addr2 = mad24(cur_y2,src_step, cur_x2*(int)sizeof(GENTYPE));
-    GENTYPE temp0,temp1;
-    __local GENTYPE LDS_DAT[2*LSIZE1*LSIZE0];
+    int gidx = get_global_id(0), gidy = get_global_id(1); // destination pixel written by this work-item
+    int l_x = get_local_id(0), l_y = get_local_id(1);
+    int x = get_group_id(0) * LSIZE0, y = get_group_id(1) * LSIZE1; // origin of this work-group's tile
+    int start_x = x + src_offset_x - RADIUSX; // first staged column: tile origin shifted left by RADIUSX
+    int end_x = x + src_offset_x + LSIZE0 + RADIUSX;
+    int width = end_x - (x + src_offset_x - RADIUSX) + 1; // = LSIZE0 + 2*RADIUSX + 1, row pitch used to index LDS_DAT
+    int start_y = y + src_offset_y - RADIUSY;
+    int point1 = mad24(l_y, LSIZE0, l_x); // linear id of this work-item inside the group
+    int point2 = point1 + LSIZE0 * LSIZE1; // second slot: every work-item stages two pixels
+    int tl_x = point1 % width, tl_y = point1 / width; // slot -> (column, row) inside the staged tile
+    int tl_x2 = point2 % width, tl_y2 = point2 / width;
+    int cur_x = start_x + tl_x, cur_y = start_y + tl_y; // staged tile position -> source coordinates
+    int cur_x2 = start_x + tl_x2, cur_y2 = start_y + tl_y2;
+    int start_addr = mad24(cur_y, src_step, cur_x * TSIZE); // byte offsets of the two source pixels
+    int start_addr2 = mad24(cur_y2, src_step, cur_x2 * TSIZE);
 
-    int end_addr = mad24(src_whole_rows - 1,src_step,src_whole_cols*(int)sizeof(GENTYPE));
-    //read pixels from src
-    start_addr = ((start_addr < end_addr) && (start_addr > 0)) ? start_addr : 0;
-    start_addr2 = ((start_addr2 < end_addr) && (start_addr2 > 0)) ? start_addr2 : 0;
-    __global const GENTYPE * src;
-    src = (__global const GENTYPE *)(srcptr+start_addr);
-    temp0 = src[0];
-    src = (__global const GENTYPE *)(srcptr+start_addr2);
-    temp1 = src[0];
-    //judge if read out of boundary
-    temp0= ELEM(cur_x,0,src_whole_cols,(GENTYPE)VAL,temp0);
-    temp0= ELEM(cur_y,0,src_whole_rows,(GENTYPE)VAL,temp0);
+    __local T LDS_DAT[2*LSIZE1*LSIZE0]; // local-memory tile shared by the work-group, two slots per work-item
 
-    temp1= ELEM(cur_x2,0,src_whole_cols,(GENTYPE)VAL,temp1);
-    temp1= ELEM(cur_y2,0,src_whole_rows,(GENTYPE)VAL,temp1);
+    // read pixels from src
+    int end_addr = mad24(src_whole_rows - 1, src_step, src_whole_cols * TSIZE);
+    start_addr = start_addr < end_addr && start_addr > 0 ? start_addr : 0; // offsets outside (0, end_addr) fall back to offset 0
+    start_addr2 = start_addr2 < end_addr && start_addr2 > 0 ? start_addr2 : 0;
+
+    T temp0 = loadpix(srcptr + start_addr);
+    T temp1 = loadpix(srcptr + start_addr2);
+
+    // judge if read out of boundary
+    temp0 = ELEM(cur_x, 0, src_whole_cols, (T)(VAL),temp0); // coordinates outside the whole image get the constant VAL
+    temp0 = ELEM(cur_y, 0, src_whole_rows, (T)(VAL),temp0);
+
+    temp1 = ELEM(cur_x2, 0, src_whole_cols, (T)(VAL), temp1);
+    temp1 = ELEM(cur_y2, 0, src_whole_rows, (T)(VAL), temp1);
 
     LDS_DAT[point1] = temp0;
     LDS_DAT[point2] = temp1;
     barrier(CLK_LOCAL_MEM_FENCE);
-    GENTYPE res = (GENTYPE)VAL;
-    for(int i=0; i<2*RADIUSY+1; i++)
-        for(int j=0; j<2*RADIUSX+1; j++)
+
+    T res = (T)(VAL); // accumulator starts at VAL
+    for (int i = 0, sizey = 2 * RADIUSY + 1; i < sizey; i++) // walk the structuring-element window row by row
+        for (int j = 0, sizex = 2 * RADIUSX + 1; j < sizex; j++)
         {
             res =
 #ifndef RECTKERNEL
                 mat_kernel[i*(2*RADIUSX+1)+j] ?
 #endif
-                MORPH_OP(res,LDS_DAT[mad24(l_y+i,width,l_x+j)])
+                MORPH_OP(res, LDS_DAT[mad24(l_y + i, width, l_x + j)])
 #ifndef RECTKERNEL
-                :res
+                : res
 #endif
                 ;
         }
-    int gidx = get_global_id(0);
-    int gidy = get_global_id(1);
-    if(gidx<cols && gidy<rows)
-    {
-        int dst_index = mad24(gidy, dst_step, dst_offset + gidx * (int)sizeof(GENTYPE));
-        __global GENTYPE * dst = (__global GENTYPE *)(dstptr + dst_index);
-        dst[0] = res;
-    }
 
+    if (gidx < cols && gidy < rows) // only work-items inside cols x rows write a result
+    {
+        int dst_index = mad24(gidy, dst_step, mad24(gidx, TSIZE, dst_offset)); // byte offset of the destination pixel
+        storepix(res, dstptr + dst_index);
+    }
 }
diff --git a/modules/imgproc/src/opencl/resize.cl b/modules/imgproc/src/opencl/resize.cl
index a142d781c..d656bf6d1 100644
--- a/modules/imgproc/src/opencl/resize.cl
+++ b/modules/imgproc/src/opencl/resize.cl
@@ -43,110 +43,140 @@
 //
 //M*/
 
-#if defined DOUBLE_SUPPORT
+#ifdef DOUBLE_SUPPORT
+#ifdef cl_amd_fp64
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#elif defined (cl_khr_fp64)
 #pragma OPENCL EXTENSION cl_khr_fp64:enable
 #endif
+#endif
 
-#define INTER_RESIZE_COEF_BITS 11
 #define INTER_RESIZE_COEF_SCALE (1 << INTER_RESIZE_COEF_BITS)
 #define CAST_BITS (INTER_RESIZE_COEF_BITS << 1)
 #define INC(x,l) min(x+1,l-1)
 
-
-#define noconvert(x) (x)
+#define noconvert
 
 #if cn != 3
-#define loadpix(addr)  *(__global const PIXTYPE*)(addr)
-#define storepix(val, addr)  *(__global PIXTYPE*)(addr) = val
-#define PIXSIZE ((int)sizeof(PIXTYPE))
+#define loadpix(addr)  *(__global const T *)(addr)
+#define storepix(val, addr)  *(__global T *)(addr) = val
+#define TSIZE ((int)sizeof(T)) // bytes per pixel; parenthesized like the PIXSIZE macro it replaces
 #else
-#define loadpix(addr)  vload3(0, (__global const PIXTYPE1*)(addr))
-#define storepix(val, addr) vstore3(val, 0, (__global PIXTYPE1*)(addr))
-#define PIXSIZE ((int)sizeof(PIXTYPE1)*3)
+#define loadpix(addr)  vload3(0, (__global const T1 *)(addr))
+#define storepix(val, addr) vstore3(val, 0, (__global T1 *)(addr))
+#define TSIZE ((int)sizeof(T1)*cn) // bytes per pixel: cn (3 in this branch) T1 elements; parenthesized so the product cannot rebind
 #endif
 
-#if defined INTER_LINEAR
+#ifdef INTER_LINEAR_INTEGER
 
-__kernel void resizeLN(__global const uchar* srcptr, int srcstep, int srcoffset,
-                       int srcrows, int srccols,
-                       __global uchar* dstptr, int dststep, int dstoffset,
-                       int dstrows, int dstcols,
+__kernel void resizeLN(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols, // linear resize driven by offset/weight tables read from `buffer`
+                       __global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,
+                       __global const uchar * buffer)
+{
+    int dx = get_global_id(0);
+    int dy = get_global_id(1);
+
+    if (dx < dst_cols && dy < dst_rows) // one work-item per destination pixel; the rest do nothing
+    {
+        __global const int * xofs = (__global const int *)(buffer), * yofs = xofs + dst_cols; // buffer starts with dst_cols ints (xofs), then yofs
+        __global const short * ialpha = (__global const short *)(yofs + dst_rows); // shorts begin after dst_rows ints of yofs
+        __global const short * ibeta = ialpha + ((dst_cols + dy) << 1); // skips 2*dst_cols ialpha shorts, then selects the pair for row dy
+        ialpha += dx << 1; // pair of horizontal weights for column dx
+
+        int sx0 = xofs[dx], sy0 = clamp(yofs[dy], 0, src_rows - 1), // rows are clamped to the source; sx0 is used as stored
+        sy1 = clamp(yofs[dy] + 1, 0, src_rows - 1);
+        short a0 = ialpha[0], a1 = ialpha[1];
+        short b0 = ibeta[0], b1 = ibeta[1];
+
+        int src_index0 = mad24(sy0, src_step, mad24(sx0, TSIZE, src_offset)), // byte offsets of column sx0 in rows sy0 and sy1
+        src_index1 = mad24(sy1, src_step, mad24(sx0, TSIZE, src_offset));
+        WT data0 = convertToWT(loadpix(srcptr + src_index0)); // (sx0, sy0)
+        WT data1 = convertToWT(loadpix(srcptr + src_index0 + TSIZE)); // next pixel in row sy0
+        WT data2 = convertToWT(loadpix(srcptr + src_index1)); // (sx0, sy1)
+        WT data3 = convertToWT(loadpix(srcptr + src_index1 + TSIZE)); // next pixel in row sy1
+
+        WT val = ( (((data0 * a0 + data1 * a1) >> 4) * b0) >> 16) +
+                 ( (((data2 * a0 + data3 * a1) >> 4) * b1) >> 16); // each row: horizontal blend >> 4, times vertical weight, >> 16
+
+        storepix(convertToDT((val + 2) >> 2), // add 2 before the final >> 2 so the result is rounded
+                 dstptr + mad24(dy, dst_step, mad24(dx, TSIZE, dst_offset)));
+    }
+}
+
+#elif defined INTER_LINEAR
+
+__kernel void resizeLN(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols, // linear resize: blends the four source pixels around each destination pixel
+                       __global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,
                        float ifx, float ify)
 {
     int dx = get_global_id(0);
     int dy = get_global_id(1);
 
-    float sx = ((dx+0.5f) * ifx - 0.5f), sy = ((dy+0.5f) * ify - 0.5f);
-    int x = floor(sx), y = floor(sy);
+    if (dx < dst_cols && dy < dst_rows) // work-items outside the destination neither read nor write
+    {
+        float sx = ((dx+0.5f) * ifx - 0.5f), sy = ((dy+0.5f) * ify - 0.5f); // source coordinates of the destination pixel centre
+        int x = floor(sx), y = floor(sy);
 
-    float u = sx - x, v = sy - y;
+        float u = sx - x, v = sy - y; // fractional parts become the blend weights
 
-    if ( x<0 ) x=0,u=0;
-    if ( x>=srccols ) x=srccols-1,u=0;
-    if ( y<0 ) y=0,v=0;
-    if ( y>=srcrows ) y=srcrows-1,v=0;
+        if ( x<0 ) x=0,u=0; // clamp to the source and zero the weight on that axis
+        if ( x>=src_cols ) x=src_cols-1,u=0;
+        if ( y<0 ) y=0,v=0;
+        if ( y>=src_rows ) y=src_rows-1,v=0;
 
-    int y_ = INC(y,srcrows);
-    int x_ = INC(x,srccols);
+        int y_ = INC(y, src_rows); // next row/column, capped at the last one by INC
+        int x_ = INC(x, src_cols);
 
 #if depth <= 4
+        u = u * INTER_RESIZE_COEF_SCALE; // depth <= 4: weights are scaled to integers
+        v = v * INTER_RESIZE_COEF_SCALE;
 
-    u = u * INTER_RESIZE_COEF_SCALE;
-    v = v * INTER_RESIZE_COEF_SCALE;
+        int U = rint(u);
+        int V = rint(v);
+        int U1 = rint(INTER_RESIZE_COEF_SCALE - u); // complementary weights
+        int V1 = rint(INTER_RESIZE_COEF_SCALE - v);
 
-    int U = rint(u);
-    int V = rint(v);
-    int U1 = rint(INTER_RESIZE_COEF_SCALE - u);
-    int V1 = rint(INTER_RESIZE_COEF_SCALE - v);
+        WT data0 = convertToWT(loadpix(srcptr + mad24(y, src_step, mad24(x, TSIZE, src_offset)))); // (x, y)
+        WT data1 = convertToWT(loadpix(srcptr + mad24(y, src_step, mad24(x_, TSIZE, src_offset)))); // (x_, y)
+        WT data2 = convertToWT(loadpix(srcptr + mad24(y_, src_step, mad24(x, TSIZE, src_offset)))); // (x, y_)
+        WT data3 = convertToWT(loadpix(srcptr + mad24(y_, src_step, mad24(x_, TSIZE, src_offset)))); // (x_, y_)
 
-    WORKTYPE data0 = convertToWT(loadpix(srcptr + mad24(y, srcstep, srcoffset + x*PIXSIZE)));
-    WORKTYPE data1 = convertToWT(loadpix(srcptr + mad24(y, srcstep, srcoffset + x_*PIXSIZE)));
-    WORKTYPE data2 = convertToWT(loadpix(srcptr + mad24(y_, srcstep, srcoffset + x*PIXSIZE)));
-    WORKTYPE data3 = convertToWT(loadpix(srcptr + mad24(y_, srcstep, srcoffset + x_*PIXSIZE)));
-
-    WORKTYPE val = mul24((WORKTYPE)mul24(U1, V1), data0) + mul24((WORKTYPE)mul24(U, V1), data1) +
-               mul24((WORKTYPE)mul24(U1, V), data2) + mul24((WORKTYPE)mul24(U, V), data3);
-
-    PIXTYPE uval = convertToDT((val + (1<<(CAST_BITS-1)))>>CAST_BITS);
+        WT val = mul24((WT)mul24(U1, V1), data0) + mul24((WT)mul24(U, V1), data1) +
+                   mul24((WT)mul24(U1, V), data2) + mul24((WT)mul24(U, V), data3);
 
+        T uval = convertToDT((val + (1<<(CAST_BITS-1)))>>CAST_BITS); // add half an LSB, then shift out both weight scales
 #else
-    float u1 = 1.f - u;
-    float v1 = 1.f - v;
-    WORKTYPE data0 = convertToWT(loadpix(srcptr + mad24(y, srcstep, srcoffset + x*PIXSIZE)));
-    WORKTYPE data1 = convertToWT(loadpix(srcptr + mad24(y, srcstep, srcoffset + x_*PIXSIZE)));
-    WORKTYPE data2 = convertToWT(loadpix(srcptr + mad24(y_, srcstep, srcoffset + x*PIXSIZE)));
-    WORKTYPE data3 = convertToWT(loadpix(srcptr + mad24(y_, srcstep, srcoffset + x_*PIXSIZE)));
-
-    PIXTYPE uval = u1 * v1 * data0 + u * v1 * data1 + u1 * v *data2 + u * v *data3;
+        float u1 = 1.f - u; // depth > 4: blend directly with float weights
+        float v1 = 1.f - v;
+        WT data0 = convertToWT(loadpix(srcptr + mad24(y, src_step, mad24(x, TSIZE, src_offset))));
+        WT data1 = convertToWT(loadpix(srcptr + mad24(y, src_step, mad24(x_, TSIZE, src_offset))));
+        WT data2 = convertToWT(loadpix(srcptr + mad24(y_, src_step, mad24(x, TSIZE, src_offset))));
+        WT data3 = convertToWT(loadpix(srcptr + mad24(y_, src_step, mad24(x_, TSIZE, src_offset))));
 
+        T uval = u1 * v1 * data0 + u * v1 * data1 + u1 * v *data2 + u * v *data3;
 #endif
-
-    if(dx < dstcols && dy < dstrows)
-    {
-        storepix(uval, dstptr + mad24(dy, dststep, dstoffset + dx*PIXSIZE));
+        storepix(uval, dstptr + mad24(dy, dst_step, mad24(dx, TSIZE, dst_offset)));
     }
 }
 
 #elif defined INTER_NEAREST
 
-__kernel void resizeNN(__global const uchar* srcptr, int srcstep, int srcoffset,
-                       int srcrows, int srccols,
-                       __global uchar* dstptr, int dststep, int dstoffset,
-                       int dstrows, int dstcols,
+__kernel void resizeNN(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols, // nearest-neighbour resize: copies one source pixel per destination pixel
+                       __global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,
                        float ifx, float ify)
 {
     int dx = get_global_id(0);
     int dy = get_global_id(1);
 
-    if( dx < dstcols && dy < dstrows )
+    if (dx < dst_cols && dy < dst_rows) // work-items outside the destination do nothing
     {
-        float s1 = dx*ifx;
-        float s2 = dy*ify;
-        int sx = min(convert_int_rtz(s1), srccols-1);
-        int sy = min(convert_int_rtz(s2), srcrows-1);
+        float s1 = dx * ifx; // destination index scaled by ifx / ify
+        float s2 = dy * ify;
+        int sx = min(convert_int_rtz(s1), src_cols - 1); // round toward zero, capped at the last column/row
+        int sy = min(convert_int_rtz(s2), src_rows - 1);
 
-        storepix(loadpix(srcptr + mad24(sy, srcstep, srcoffset + sx*PIXSIZE)),
-                 dstptr + mad24(dy, dststep, dstoffset + dx*PIXSIZE));
+        storepix(loadpix(srcptr + mad24(sy, src_step, mad24(sx, TSIZE, src_offset))), // source pixel (sx, sy)
+                 dstptr + mad24(dy, dst_step, mad24(dx, TSIZE, dst_offset)));
     }
 }
 
@@ -179,10 +209,10 @@ __kernel void resizeAREA_FAST(__global const uchar * src, int src_step, int src_
             int src_index = mad24(symap_tab[y + sy], src_step, src_offset);
             #pragma unroll
             for (int x = 0; x < XSCALE; ++x)
-                sum += convertToWTV(loadpix(src + src_index + sxmap_tab[sx + x]*PIXSIZE));
+                sum += convertToWTV(loadpix(src + mad24(sxmap_tab[sx + x], TSIZE, src_index)));
         }
 
-        storepix(convertToPIXTYPE(convertToWT2V(sum) * (WT2V)(SCALE)), dst + dst_index + dx*PIXSIZE);
+        storepix(convertToT(convertToWT2V(sum) * (WT2V)(SCALE)), dst + mad24(dx, TSIZE, dst_index));
     }
 }
 
@@ -224,12 +254,12 @@ __kernel void resizeAREA(__global const uchar * src, int src_step, int src_offse
             for (int sx = sx0, xk = xk0; sx <= sx1; ++sx, ++xk)
             {
                 WTV alpha = (WTV)(xalpha_tab[xk]);
-                buf += convertToWTV(loadpix(src + src_index + sx*PIXSIZE)) * alpha;
+                buf += convertToWTV(loadpix(src + mad24(sx, TSIZE, src_index))) * alpha;
             }
             sum += buf * beta;
         }
 
-        storepix(convertToPIXTYPE(sum), dst + dst_index + dx*PIXSIZE);
+        storepix(convertToT(sum), dst + mad24(dx, TSIZE, dst_index));
     }
 }
 
diff --git a/modules/imgproc/src/smooth.cpp b/modules/imgproc/src/smooth.cpp
index 40687a226..6a18af5c2 100644
--- a/modules/imgproc/src/smooth.cpp
+++ b/modules/imgproc/src/smooth.cpp
@@ -639,7 +639,7 @@ static bool ocl_boxFilter( InputArray _src, OutputArray _dst, int ddepth,
     if (ddepth < 0)
         ddepth = sdepth;
 
-    if (!(cn == 1 || cn == 2 || cn == 4) || (!doubleSupport && (sdepth == CV_64F || ddepth == CV_64F)) ||
+    if (cn > 4 || (!doubleSupport && (sdepth == CV_64F || ddepth == CV_64F)) ||
         _src.offset() % esz != 0 || _src.step() % esz != 0)
         return false;
 
@@ -687,15 +687,17 @@ static bool ocl_boxFilter( InputArray _src, OutputArray _dst, int ddepth,
             return false;
 
         char cvt[2][50];
-        String opts = format("-D LOCAL_SIZE_X=%d -D BLOCK_SIZE_Y=%d -D ST=%s -D DT=%s -D WT=%s -D convertToDT=%s -D convertToWT=%s "
-                             "-D ANCHOR_X=%d -D ANCHOR_Y=%d -D KERNEL_SIZE_X=%d -D KERNEL_SIZE_Y=%d -D %s%s%s%s%s",
+        String opts = format("-D LOCAL_SIZE_X=%d -D BLOCK_SIZE_Y=%d -D ST=%s -D DT=%s -D WT=%s -D convertToDT=%s -D convertToWT=%s"
+                             " -D ANCHOR_X=%d -D ANCHOR_Y=%d -D KERNEL_SIZE_X=%d -D KERNEL_SIZE_Y=%d -D %s%s%s%s%s"
+                             " -D ST1=%s -D DT1=%s -D cn=%d",
                              BLOCK_SIZE_X, BLOCK_SIZE_Y, ocl::typeToStr(type), ocl::typeToStr(CV_MAKE_TYPE(ddepth, cn)),
                              ocl::typeToStr(CV_MAKE_TYPE(wdepth, cn)),
                              ocl::convertTypeStr(wdepth, ddepth, cn, cvt[0]),
                              ocl::convertTypeStr(sdepth, wdepth, cn, cvt[1]),
                              anchor.x, anchor.y, ksize.width, ksize.height, borderMap[borderType],
                              isolated ? " -D BORDER_ISOLATED" : "", doubleSupport ? " -D DOUBLE_SUPPORT" : "",
-                             normalize ? " -D NORMALIZE" : "", sqr ? " -D SQR" : "");
+                             normalize ? " -D NORMALIZE" : "", sqr ? " -D SQR" : "",
+                             ocl::typeToStr(sdepth), ocl::typeToStr(ddepth), cn);
 
         localsize[0] = BLOCK_SIZE_X;
         globalsize[0] = DIVUP(size.width, BLOCK_SIZE_X - (ksize.width - 1)) * BLOCK_SIZE_X;
@@ -1902,35 +1904,27 @@ medianBlur_SortNet( const Mat& _src, Mat& _dst, int m )
 
 #ifdef HAVE_OPENCL
 
-static bool ocl_medianFilter ( InputArray _src, OutputArray _dst, int m)
+static bool ocl_medianFilter(InputArray _src, OutputArray _dst, int m) // OpenCL median filter; false means unsupported input, no kernel, or a failed run
 {
-    int type = _src.type();
-    int depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
+    int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
 
-    if (!((depth == CV_8U || depth == CV_16U || depth == CV_16S || depth == CV_32F) && (cn != 3 && cn <= 4)))
+    if ( !((depth == CV_8U || depth == CV_16U || depth == CV_16S || depth == CV_32F) && cn <= 4 && (m == 3 || m == 5)) ) // accepted: 8U/16U/16S/32F, at most 4 channels, m of 3 or 5
         return false;
 
-    const char * kernelName;
-
-    if (m == 3)
-        kernelName = "medianFilter3";
-    else if (m == 5)
-        kernelName = "medianFilter5";
-    else
-        return false;
-
-    ocl::Kernel k(kernelName,ocl::imgproc::medianFilter_oclsrc,format("-D type=%s",ocl::typeToStr(type)));
+    ocl::Kernel k(format("medianFilter%d", m).c_str(), ocl::imgproc::medianFilter_oclsrc, // kernel name is medianFilter3 or medianFilter5
+                  format("-D T=%s -D T1=%s -D cn=%d", ocl::typeToStr(type), // T = full pixel type, T1 = its depth, cn = channel count
+                         ocl::typeToStr(depth), cn));
     if (k.empty())
         return false;
 
     UMat src = _src.getUMat();
-    _dst.create(_src.size(),type);
+    _dst.create(src.size(), type); // destination gets the source's size and type
     UMat dst = _dst.getUMat();
 
-    size_t globalsize[2] = {(src.cols + 18) / 16 * 16, (src.rows + 15) / 16 * 16};
-    size_t localsize[2] = {16, 16};
+    k.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::WriteOnly(dst));
 
-    return k.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::WriteOnly(dst)).run(2,globalsize,localsize,false);
+    size_t globalsize[2] = { (src.cols + 18) / 16 * 16, (src.rows + 15) / 16 * 16}, localsize[2] = { 16, 16 }; // 16x16 work-groups; global sizes are multiples of 16
+    return k.run(2, globalsize, localsize, false);
 }
 
 #endif
@@ -2210,10 +2204,10 @@ static bool ocl_bilateralFilter_8u(InputArray _src, OutputArray _dst, int d,
                                    double sigma_color, double sigma_space,
                                    int borderType)
 {
-    int type = _src.type(), cn = CV_MAT_CN(type);
+    int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
     int i, j, maxk, radius;
 
-    if ( type != CV_8UC1 )
+    if (depth != CV_8U || cn > 4)
         return false;
 
     if (sigma_color <= 0)
@@ -2240,9 +2234,9 @@ static bool ocl_bilateralFilter_8u(InputArray _src, OutputArray _dst, int d,
     std::vector<float> _color_weight(cn * 256);
     std::vector<float> _space_weight(d * d);
     std::vector<int> _space_ofs(d * d);
-    float *color_weight = &_color_weight[0];
-    float *space_weight = &_space_weight[0];
-    int *space_ofs = &_space_ofs[0];
+    float * const color_weight = &_color_weight[0];
+    float * const space_weight = &_space_weight[0];
+    int * const space_ofs = &_space_ofs[0];
 
     // initialize color-related bilateral filter coefficients
     for( i = 0; i < 256 * cn; i++ )
@@ -2256,11 +2250,19 @@ static bool ocl_bilateralFilter_8u(InputArray _src, OutputArray _dst, int d,
             if ( r > radius )
                 continue;
             space_weight[maxk] = (float)std::exp(r * r * gauss_space_coeff);
-            space_ofs[maxk++] = (int)(i * temp.step + j);
+            space_ofs[maxk++] = (int)(i * temp.step + j * cn);
         }
 
+    char cvt[3][40];
+    String cnstr = cn > 1 ? format("%d", cn) : "";
     ocl::Kernel k("bilateral", ocl::imgproc::bilateral_oclsrc,
-                  format("-D radius=%d -D maxk=%d", radius, maxk));
+                  format("-D radius=%d -D maxk=%d -D cn=%d -D int_t=%s -D uint_t=uint%s -D convert_int_t=%s"
+                         " -D uchar_t=%s -D float_t=%s -D convert_float_t=%s -D convert_uchar_t=%s",
+                         radius, maxk, cn, ocl::typeToStr(CV_32SC(cn)), cnstr.c_str(),
+                         ocl::convertTypeStr(CV_8U, CV_32S, cn, cvt[0]),
+                         ocl::typeToStr(type), ocl::typeToStr(CV_32FC(cn)),
+                         ocl::convertTypeStr(CV_32S, CV_32F, cn, cvt[1]),
+                         ocl::convertTypeStr(CV_32F, CV_8U, cn, cvt[2])));
     if (k.empty())
         return false;
 
diff --git a/modules/imgproc/test/ocl/test_accumulate.cpp b/modules/imgproc/test/ocl/test_accumulate.cpp
index 586c34b26..50c9085cb 100644
--- a/modules/imgproc/test/ocl/test_accumulate.cpp
+++ b/modules/imgproc/test/ocl/test_accumulate.cpp
@@ -58,10 +58,10 @@ PARAM_TEST_CASE(AccumulateBase, std::pair<MatDepth, MatDepth>, Channels, bool)
     bool useRoi;
     double alpha;
 
-    TEST_DECLARE_INPUT_PARAMETER(src)
-    TEST_DECLARE_INPUT_PARAMETER(mask)
-    TEST_DECLARE_INPUT_PARAMETER(src2)
-    TEST_DECLARE_OUTPUT_PARAMETER(dst)
+    TEST_DECLARE_INPUT_PARAMETER(src);
+    TEST_DECLARE_INPUT_PARAMETER(mask);
+    TEST_DECLARE_INPUT_PARAMETER(src2);
+    TEST_DECLARE_OUTPUT_PARAMETER(dst);
 
     virtual void SetUp()
     {
@@ -90,10 +90,10 @@ PARAM_TEST_CASE(AccumulateBase, std::pair<MatDepth, MatDepth>, Channels, bool)
         Border dstBorder = randomBorder(0, useRoi ? MAX_VALUE : 0);
         randomSubMat(dst, dst_roi, roiSize, dstBorder, dtype, -MAX_VALUE, MAX_VALUE);
 
-        UMAT_UPLOAD_INPUT_PARAMETER(src)
-        UMAT_UPLOAD_INPUT_PARAMETER(mask)
-        UMAT_UPLOAD_INPUT_PARAMETER(src2)
-        UMAT_UPLOAD_OUTPUT_PARAMETER(dst)
+        UMAT_UPLOAD_INPUT_PARAMETER(src);
+        UMAT_UPLOAD_INPUT_PARAMETER(mask);
+        UMAT_UPLOAD_INPUT_PARAMETER(src2);
+        UMAT_UPLOAD_OUTPUT_PARAMETER(dst);
 
         alpha = randomDouble(-5, 5);
     }
diff --git a/modules/imgproc/test/ocl/test_blend.cpp b/modules/imgproc/test/ocl/test_blend.cpp
index 17c0b1312..7b62b9717 100644
--- a/modules/imgproc/test/ocl/test_blend.cpp
+++ b/modules/imgproc/test/ocl/test_blend.cpp
@@ -57,11 +57,11 @@ PARAM_TEST_CASE(BlendLinear, MatDepth, Channels, bool)
     int depth, channels;
     bool useRoi;
 
-    TEST_DECLARE_INPUT_PARAMETER(src1)
-    TEST_DECLARE_INPUT_PARAMETER(src2)
-    TEST_DECLARE_INPUT_PARAMETER(weights2)
-    TEST_DECLARE_INPUT_PARAMETER(weights1)
-    TEST_DECLARE_OUTPUT_PARAMETER(dst)
+    TEST_DECLARE_INPUT_PARAMETER(src1);
+    TEST_DECLARE_INPUT_PARAMETER(src2);
+    TEST_DECLARE_INPUT_PARAMETER(weights2);
+    TEST_DECLARE_INPUT_PARAMETER(weights1);
+    TEST_DECLARE_OUTPUT_PARAMETER(dst);
 
     virtual void SetUp()
     {
@@ -89,22 +89,22 @@ PARAM_TEST_CASE(BlendLinear, MatDepth, Channels, bool)
         randomSubMat(weights2, weights2_roi, roiSize, weights2Border, CV_32FC1, 1e-2, upValue);
 
         weights2_roi -= weights1_roi;
-        CV_Assert(checkNorm(weights2_roi, weights2(Rect(weights2Border.lef, weights2Border.top,
+        CV_Assert(checkNorm2(weights2_roi, weights2(Rect(weights2Border.lef, weights2Border.top,
                                                         roiSize.width, roiSize.height))) < 1e-6);
 
         Border dstBorder = randomBorder(0, useRoi ? MAX_VALUE : 0);
         randomSubMat(dst, dst_roi, roiSize, dstBorder, type, 5, 16);
 
-        UMAT_UPLOAD_INPUT_PARAMETER(src1)
-        UMAT_UPLOAD_INPUT_PARAMETER(src2)
-        UMAT_UPLOAD_INPUT_PARAMETER(weights1)
-        UMAT_UPLOAD_INPUT_PARAMETER(weights2)
-        UMAT_UPLOAD_OUTPUT_PARAMETER(dst)
+        UMAT_UPLOAD_INPUT_PARAMETER(src1);
+        UMAT_UPLOAD_INPUT_PARAMETER(src2);
+        UMAT_UPLOAD_INPUT_PARAMETER(weights1);
+        UMAT_UPLOAD_INPUT_PARAMETER(weights2);
+        UMAT_UPLOAD_OUTPUT_PARAMETER(dst);
     }
 
     void Near(double eps = 0.0)
     {
-        OCL_EXPECT_MATS_NEAR(dst, eps)
+        OCL_EXPECT_MATS_NEAR(dst, eps);
     }
 };
 
diff --git a/modules/imgproc/test/ocl/test_boxfilter.cpp b/modules/imgproc/test/ocl/test_boxfilter.cpp
index c95657c9e..63f4ebff2 100644
--- a/modules/imgproc/test/ocl/test_boxfilter.cpp
+++ b/modules/imgproc/test/ocl/test_boxfilter.cpp
@@ -61,8 +61,8 @@ PARAM_TEST_CASE(BoxFilterBase, MatDepth, Channels, BorderType, bool, bool)
     Point anchor;
     bool normalize, useRoi;
 
-    TEST_DECLARE_INPUT_PARAMETER(src)
-    TEST_DECLARE_OUTPUT_PARAMETER(dst)
+    TEST_DECLARE_INPUT_PARAMETER(src);
+    TEST_DECLARE_OUTPUT_PARAMETER(dst);
 
     virtual void SetUp()
     {
@@ -76,7 +76,6 @@ PARAM_TEST_CASE(BoxFilterBase, MatDepth, Channels, BorderType, bool, bool)
     void random_roi()
     {
         int type = CV_MAKE_TYPE(depth, cn);
-        dsize = randomSize(1, MAX_VALUE);
         ksize = randomSize(kernelMinSize, kernelMaxSize);
 
         Size roiSize = randomSize(ksize.width, MAX_VALUE, ksize.height, MAX_VALUE);
@@ -84,18 +83,18 @@ PARAM_TEST_CASE(BoxFilterBase, MatDepth, Channels, BorderType, bool, bool)
         randomSubMat(src, src_roi, roiSize, srcBorder, type, -MAX_VALUE, MAX_VALUE);
 
         Border dstBorder = randomBorder(0, useRoi ? MAX_VALUE : 0);
-        randomSubMat(dst, dst_roi, dsize, dstBorder, type, -MAX_VALUE, MAX_VALUE);
+        randomSubMat(dst, dst_roi, roiSize, dstBorder, type, -MAX_VALUE, MAX_VALUE);
 
         anchor.x = randomInt(-1, ksize.width);
         anchor.y = randomInt(-1, ksize.height);
 
-        UMAT_UPLOAD_INPUT_PARAMETER(src)
-        UMAT_UPLOAD_OUTPUT_PARAMETER(dst)
+        UMAT_UPLOAD_INPUT_PARAMETER(src);
+        UMAT_UPLOAD_OUTPUT_PARAMETER(dst);
     }
 
     void Near(double threshold = 0.0)
     {
-        OCL_EXPECT_MATS_NEAR(dst, threshold)
+        OCL_EXPECT_MATS_NEAR(dst, threshold);
     }
 };
 
@@ -134,7 +133,7 @@ OCL_TEST_P(SqrBoxFilter, Mat)
 OCL_INSTANTIATE_TEST_CASE_P(ImageProc, BoxFilter,
                             Combine(
                                 Values(CV_8U, CV_16U, CV_16S, CV_32S, CV_32F),
-                                Values(1, 2, 4),
+                                OCL_ALL_CHANNELS,
                                 Values((BorderType)BORDER_CONSTANT,
                                        (BorderType)BORDER_REPLICATE,
                                        (BorderType)BORDER_REFLECT,
@@ -147,7 +146,7 @@ OCL_INSTANTIATE_TEST_CASE_P(ImageProc, BoxFilter,
 OCL_INSTANTIATE_TEST_CASE_P(ImageProc, SqrBoxFilter,
                             Combine(
                                 Values(CV_8U, CV_16U, CV_16S, CV_32F, CV_64F),
-                                Values(1, 2, 4),
+                                OCL_ALL_CHANNELS,
                                 Values((BorderType)BORDER_CONSTANT,
                                        (BorderType)BORDER_REPLICATE,
                                        (BorderType)BORDER_REFLECT,
diff --git a/modules/imgproc/test/ocl/test_canny.cpp b/modules/imgproc/test/ocl/test_canny.cpp
index e328d2a2f..631fe5bd1 100644
--- a/modules/imgproc/test/ocl/test_canny.cpp
+++ b/modules/imgproc/test/ocl/test_canny.cpp
@@ -58,29 +58,29 @@ IMPLEMENT_PARAM_CLASS(AppertureSize, int)
 IMPLEMENT_PARAM_CLASS(L2gradient, bool)
 IMPLEMENT_PARAM_CLASS(UseRoi, bool)
 
-PARAM_TEST_CASE(Canny, AppertureSize, L2gradient, UseRoi)
+PARAM_TEST_CASE(Canny, Channels, AppertureSize, L2gradient, UseRoi)
 {
-    int apperture_size;
+    int cn, apperture_size;
     bool useL2gradient, use_roi;
 
-    TEST_DECLARE_INPUT_PARAMETER(src)
-    TEST_DECLARE_OUTPUT_PARAMETER(dst)
+    TEST_DECLARE_INPUT_PARAMETER(src);
+    TEST_DECLARE_OUTPUT_PARAMETER(dst);
 
     virtual void SetUp()
     {
-        apperture_size = GET_PARAM(0);
-        useL2gradient = GET_PARAM(1);
-        use_roi = GET_PARAM(2);
+        cn = GET_PARAM(0);
+        apperture_size = GET_PARAM(1);
+        useL2gradient = GET_PARAM(2);
+        use_roi = GET_PARAM(3);
     }
 
     void generateTestData()
     {
-        Mat img = readImage("shared/fruits.png", IMREAD_GRAYSCALE);
+        Mat img = readImageType("shared/fruits.png", CV_8UC(cn));
         ASSERT_FALSE(img.empty()) << "cann't load shared/fruits.png";
 
         Size roiSize = img.size();
         int type = img.type();
-        ASSERT_EQ(CV_8UC1, type);
 
         Border srcBorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
         randomSubMat(src, src_roi, roiSize, srcBorder, type, 2, 100);
@@ -89,8 +89,8 @@ PARAM_TEST_CASE(Canny, AppertureSize, L2gradient, UseRoi)
         Border dstBorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
         randomSubMat(dst, dst_roi, roiSize, dstBorder, type, 5, 16);
 
-        UMAT_UPLOAD_INPUT_PARAMETER(src)
-        UMAT_UPLOAD_OUTPUT_PARAMETER(dst)
+        UMAT_UPLOAD_INPUT_PARAMETER(src);
+        UMAT_UPLOAD_OUTPUT_PARAMETER(dst);
     }
 };
 
@@ -108,6 +108,7 @@ OCL_TEST_P(Canny, Accuracy)
 }
 
 OCL_INSTANTIATE_TEST_CASE_P(ImgProc, Canny, testing::Combine(
+                                testing::Values(1, 3),
                                 testing::Values(AppertureSize(3), AppertureSize(5)),
                                 testing::Values(L2gradient(false), L2gradient(true)),
                                 testing::Values(UseRoi(false), UseRoi(true))));
diff --git a/modules/imgproc/test/ocl/test_color.cpp b/modules/imgproc/test/ocl/test_color.cpp
index ffd392a03..fcf270f8e 100644
--- a/modules/imgproc/test/ocl/test_color.cpp
+++ b/modules/imgproc/test/ocl/test_color.cpp
@@ -59,8 +59,8 @@ PARAM_TEST_CASE(CvtColor, MatDepth, bool)
     int depth;
     bool use_roi;
 
-    TEST_DECLARE_INPUT_PARAMETER(src)
-    TEST_DECLARE_OUTPUT_PARAMETER(dst)
+    TEST_DECLARE_INPUT_PARAMETER(src);
+    TEST_DECLARE_OUTPUT_PARAMETER(dst);
 
     virtual void SetUp()
     {
@@ -80,13 +80,13 @@ PARAM_TEST_CASE(CvtColor, MatDepth, bool)
         Border dstBorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
         randomSubMat(dst, dst_roi, roiSize, dstBorder, dstType, 5, 16);
 
-        UMAT_UPLOAD_INPUT_PARAMETER(src)
-        UMAT_UPLOAD_OUTPUT_PARAMETER(dst)
+        UMAT_UPLOAD_INPUT_PARAMETER(src);
+        UMAT_UPLOAD_OUTPUT_PARAMETER(dst);
     }
 
     void Near(double threshold)
     {
-        OCL_EXPECT_MATS_NEAR(dst, threshold)
+        OCL_EXPECT_MATS_NEAR(dst, threshold);
     }
 
     void performTest(int channelsIn, int channelsOut, int code, double threshold = 1e-3)
@@ -287,8 +287,8 @@ struct CvtColor_YUV420 :
         Border dstBorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
         randomSubMat(dst, dst_roi, roiSize, dstBorder, dstType, 5, 16);
 
-        UMAT_UPLOAD_INPUT_PARAMETER(src)
-        UMAT_UPLOAD_OUTPUT_PARAMETER(dst)
+        UMAT_UPLOAD_INPUT_PARAMETER(src);
+        UMAT_UPLOAD_OUTPUT_PARAMETER(dst);
     }
 };
 
diff --git a/modules/imgproc/test/ocl/test_filter2d.cpp b/modules/imgproc/test/ocl/test_filter2d.cpp
index 54d65459c..222990529 100644
--- a/modules/imgproc/test/ocl/test_filter2d.cpp
+++ b/modules/imgproc/test/ocl/test_filter2d.cpp
@@ -62,9 +62,10 @@ PARAM_TEST_CASE(Filter2D, MatDepth, Channels, BorderType, bool, bool)
     int borderType;
     bool useRoi;
     Mat kernel;
+    double delta;
 
-    TEST_DECLARE_INPUT_PARAMETER(src)
-    TEST_DECLARE_OUTPUT_PARAMETER(dst)
+    TEST_DECLARE_INPUT_PARAMETER(src);
+    TEST_DECLARE_OUTPUT_PARAMETER(dst);
 
     virtual void SetUp()
     {
@@ -91,8 +92,10 @@ PARAM_TEST_CASE(Filter2D, MatDepth, Channels, BorderType, bool, bool)
         anchor.x = randomInt(-1, ksize.width);
         anchor.y = randomInt(-1, ksize.height);
 
-        UMAT_UPLOAD_INPUT_PARAMETER(src)
-        UMAT_UPLOAD_OUTPUT_PARAMETER(dst)
+        delta = randomDouble(-100, 100);
+
+        UMAT_UPLOAD_INPUT_PARAMETER(src);
+        UMAT_UPLOAD_OUTPUT_PARAMETER(dst);
     }
 
     void Near(double threshold = 0.0)
@@ -108,18 +111,17 @@ OCL_TEST_P(Filter2D, Mat)
     {
         random_roi();
 
-        OCL_OFF(cv::filter2D(src_roi, dst_roi, -1, kernel, anchor, 0.0, borderType));
-        OCL_ON(cv::filter2D(usrc_roi, udst_roi, -1, kernel, anchor, 0.0, borderType));
+        OCL_OFF(cv::filter2D(src_roi, dst_roi, -1, kernel, anchor, delta, borderType));
+        OCL_ON(cv::filter2D(usrc_roi, udst_roi, -1, kernel, anchor, delta, borderType));
 
         Near(1.0);
     }
 }
 
-
 OCL_INSTANTIATE_TEST_CASE_P(ImageProc, Filter2D,
                             Combine(
-                                Values(CV_8U, CV_16U, CV_16S, CV_32F, CV_64F),
-                                Values(1, 2, 4),
+                                Values(CV_8U, CV_16U, CV_32F),
+                                OCL_ALL_CHANNELS,
                                 Values((BorderType)BORDER_CONSTANT,
                                        (BorderType)BORDER_REPLICATE,
                                        (BorderType)BORDER_REFLECT,
diff --git a/modules/imgproc/test/ocl/test_filters.cpp b/modules/imgproc/test/ocl/test_filters.cpp
index fe16fe81d..d2f508516 100644
--- a/modules/imgproc/test/ocl/test_filters.cpp
+++ b/modules/imgproc/test/ocl/test_filters.cpp
@@ -69,8 +69,8 @@ PARAM_TEST_CASE(FilterTestBase, MatType,
     double param;
     bool useRoi;
 
-    TEST_DECLARE_INPUT_PARAMETER(src)
-    TEST_DECLARE_OUTPUT_PARAMETER(dst)
+    TEST_DECLARE_INPUT_PARAMETER(src);
+    TEST_DECLARE_OUTPUT_PARAMETER(dst);
 
     virtual void SetUp()
     {
@@ -94,8 +94,8 @@ PARAM_TEST_CASE(FilterTestBase, MatType,
         Border dstBorder = randomBorder(0, useRoi ? MAX_VALUE : 0);
         randomSubMat(dst, dst_roi, roiSize, dstBorder, type, -60, 70);
 
-        UMAT_UPLOAD_INPUT_PARAMETER(src)
-        UMAT_UPLOAD_OUTPUT_PARAMETER(dst)
+        UMAT_UPLOAD_INPUT_PARAMETER(src);
+        UMAT_UPLOAD_OUTPUT_PARAMETER(dst);
     }
 
     void Near()
@@ -112,15 +112,9 @@ PARAM_TEST_CASE(FilterTestBase, MatType,
     void Near(double threshold, bool relative)
     {
         if (relative)
-        {
-            EXPECT_MAT_NEAR_RELATIVE(dst, udst, threshold);
-            EXPECT_MAT_NEAR_RELATIVE(dst_roi, udst_roi, threshold);
-        }
+            OCL_EXPECT_MATS_NEAR_RELATIVE(dst, threshold);
         else
-        {
-            EXPECT_MAT_NEAR(dst, udst, threshold);
-            EXPECT_MAT_NEAR(dst_roi, udst_roi, threshold);
-        }
+            OCL_EXPECT_MATS_NEAR(dst, threshold);
     }
 };
 
@@ -158,8 +152,8 @@ OCL_TEST_P(LaplacianTest, Accuracy)
     {
         random_roi();
 
-        OCL_OFF(cv::Laplacian(src_roi, dst_roi, -1, ksize, scale, 0, borderType));
-        OCL_ON(cv::Laplacian(usrc_roi, udst_roi, -1, ksize, scale, 0, borderType));
+        OCL_OFF(cv::Laplacian(src_roi, dst_roi, -1, ksize, scale, 10, borderType));
+        OCL_ON(cv::Laplacian(usrc_roi, udst_roi, -1, ksize, scale, 10, borderType));
 
         Near();
     }
@@ -296,8 +290,6 @@ OCL_TEST_P(MorphologyEx, Mat)
     }
 }
 
-
-
 //////////////////////////////////////////////////////////////////////////////////////////////////////////////
 
 #define FILTER_BORDER_SET_NO_ISOLATED \
@@ -312,10 +304,10 @@ OCL_TEST_P(MorphologyEx, Mat)
             (int)BORDER_REFLECT|BORDER_ISOLATED, (int)BORDER_WRAP|BORDER_ISOLATED, \
             (int)BORDER_REFLECT_101|BORDER_ISOLATED*/) // WRAP and ISOLATED are not supported by cv:: version
 
-#define FILTER_TYPES Values(CV_8UC1, CV_8UC2, CV_8UC4, CV_32FC1, CV_32FC4, CV_64FC1, CV_64FC4)
+#define FILTER_TYPES Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_16UC1, CV_16UC3, CV_16UC4, CV_32FC1, CV_32FC3, CV_32FC4)
 
 OCL_INSTANTIATE_TEST_CASE_P(Filter, Bilateral, Combine(
-                            Values((MatType)CV_8UC1),
+                            Values(CV_8UC1, CV_8UC3),
                             Values(5, 9), // kernel size
                             Values(Size(0, 0)), // not used
                             FILTER_BORDER_SET_NO_ISOLATED,
@@ -324,7 +316,7 @@ OCL_INSTANTIATE_TEST_CASE_P(Filter, Bilateral, Combine(
 
 OCL_INSTANTIATE_TEST_CASE_P(Filter, LaplacianTest, Combine(
                             FILTER_TYPES,
-                            Values(1, 3), // kernel size
+                            Values(1, 3, 5), // kernel size
                             Values(Size(0, 0)), // not used
                             FILTER_BORDER_SET_NO_WRAP_NO_ISOLATED,
                             Values(1.0, 0.2, 3.0), // kernel scale
@@ -355,29 +347,28 @@ OCL_INSTANTIATE_TEST_CASE_P(Filter, GaussianBlurTest, Combine(
                             Bool()));
 
 OCL_INSTANTIATE_TEST_CASE_P(Filter, Erode, Combine(
-                            Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4, CV_64FC1, CV_64FC4),
+                            Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32FC1, CV_32FC3, CV_32FC4, CV_64FC1, CV_64FC4),
                             Values(3, 5, 7),
                             Values(Size(0,0)),//not used
                             Values((BorderType)BORDER_CONSTANT),//not used
                             Values(1.0, 2.0, 3.0),
-                            Bool() ) );
+                            Bool()));
 
 OCL_INSTANTIATE_TEST_CASE_P(Filter, Dilate, Combine(
-                            Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4, CV_64FC1, CV_64FC4),
+                            Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32FC1, CV_32FC3, CV_32FC4, CV_64FC1, CV_64FC4),
                             Values(3, 5, 7),
                             Values(Size(0,0)),//not used
                             Values((BorderType)BORDER_CONSTANT),//not used
                             Values(1.0, 2.0, 3.0),
-                            Bool() ) );
+                            Bool()));
 
 OCL_INSTANTIATE_TEST_CASE_P(Filter, MorphologyEx, Combine(
-                            Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4, CV_64FC1, CV_64FC4),
+                            Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32FC1, CV_32FC3, CV_32FC4, CV_64FC1, CV_64FC4),
                             Values(3, 5, 7),
-                            Values(Size(0,0), Size(0,1), Size(0,2), Size(0,3), Size(0,4), Size(0,5),Size(0,6)),//uses as generator of operations
-                            Values((BorderType)BORDER_CONSTANT),//not used
+                            Values(Size(0, 0), Size(0, 1), Size(0, 2), Size(0, 3), Size(0, 4), Size(0, 5), Size(0, 6)), // used as generator of operations
+                            Values((BorderType)BORDER_CONSTANT),// not used
                             Values(1.0, 2.0, 3.0),
-                            Bool() ) );
-
+                            Bool()));
 
 } } // namespace cvtest::ocl
 
diff --git a/modules/imgproc/test/ocl/test_gftt.cpp b/modules/imgproc/test/ocl/test_gftt.cpp
index e47997623..6e65f90dd 100644
--- a/modules/imgproc/test/ocl/test_gftt.cpp
+++ b/modules/imgproc/test/ocl/test_gftt.cpp
@@ -60,7 +60,7 @@ PARAM_TEST_CASE(GoodFeaturesToTrack, double, bool)
     static const int maxCorners;
     static const double qualityLevel;
 
-    TEST_DECLARE_INPUT_PARAMETER(src)
+    TEST_DECLARE_INPUT_PARAMETER(src);
     UMat points, upoints;
 
     virtual void SetUp()
@@ -79,7 +79,7 @@ PARAM_TEST_CASE(GoodFeaturesToTrack, double, bool)
         randomSubMat(src, src_roi, roiSize, srcBorder, frame.type(), 5, 256);
         src_roi.copyTo(frame);
 
-        UMAT_UPLOAD_INPUT_PARAMETER(src)
+        UMAT_UPLOAD_INPUT_PARAMETER(src);
     }
 
     void UMatToVector(const UMat & um, std::vector<Point2f> & v) const
diff --git a/modules/imgproc/test/ocl/test_histogram.cpp b/modules/imgproc/test/ocl/test_histogram.cpp
index b0837eeaa..0a27907b6 100644
--- a/modules/imgproc/test/ocl/test_histogram.cpp
+++ b/modules/imgproc/test/ocl/test_histogram.cpp
@@ -76,8 +76,8 @@ PARAM_TEST_CASE(CalcBackProject, MatDepth, int, bool)
     std::vector<UMat> uimages;
     std::vector<UMat> uimages_roi;
 
-    TEST_DECLARE_INPUT_PARAMETER(hist)
-    TEST_DECLARE_OUTPUT_PARAMETER(dst)
+    TEST_DECLARE_INPUT_PARAMETER(hist);
+    TEST_DECLARE_OUTPUT_PARAMETER(dst);
 
     virtual void SetUp()
     {
@@ -139,8 +139,8 @@ PARAM_TEST_CASE(CalcBackProject, MatDepth, int, bool)
             uimages_roi[i] = uimages[i](Rect(ofs.x, ofs.y, images_roi[i].cols, images_roi[i].rows));
         }
 
-        UMAT_UPLOAD_INPUT_PARAMETER(hist)
-        UMAT_UPLOAD_OUTPUT_PARAMETER(dst)
+        UMAT_UPLOAD_INPUT_PARAMETER(hist);
+        UMAT_UPLOAD_OUTPUT_PARAMETER(dst);
 
         scale = randomDouble(0.1, 1);
     }
@@ -157,7 +157,7 @@ OCL_TEST_P(CalcBackProject, Mat)
         OCL_OFF(cv::calcBackProject(images_roi, channels, hist_roi, dst_roi, ranges, scale));
         OCL_ON(cv::calcBackProject(uimages_roi, channels, uhist_roi, udst_roi, ranges, scale));
 
-        OCL_EXPECT_MATS_NEAR(dst, 0.0)
+        OCL_EXPECT_MATS_NEAR(dst, 0.0);
     }
 }
 
@@ -167,8 +167,8 @@ PARAM_TEST_CASE(CalcHist, bool)
 {
     bool useRoi;
 
-    TEST_DECLARE_INPUT_PARAMETER(src)
-    TEST_DECLARE_OUTPUT_PARAMETER(hist)
+    TEST_DECLARE_INPUT_PARAMETER(src);
+    TEST_DECLARE_OUTPUT_PARAMETER(hist);
 
     virtual void SetUp()
     {
@@ -185,8 +185,8 @@ PARAM_TEST_CASE(CalcHist, bool)
         Border histBorder = randomBorder(0, useRoi ? MAX_VALUE : 0);
         randomSubMat(hist, hist_roi, Size(1, 256), histBorder, CV_32SC1, 0, MAX_VALUE);
 
-        UMAT_UPLOAD_INPUT_PARAMETER(src)
-        UMAT_UPLOAD_OUTPUT_PARAMETER(hist)
+        UMAT_UPLOAD_INPUT_PARAMETER(src);
+        UMAT_UPLOAD_OUTPUT_PARAMETER(hist);
     }
 };
 
@@ -205,7 +205,7 @@ OCL_TEST_P(CalcHist, Mat)
         OCL_OFF(cv::calcHist(std::vector<Mat>(1, src_roi), channels, noArray(), hist_roi, histSize, ranges, false));
         OCL_ON(cv::calcHist(std::vector<UMat>(1, usrc_roi), channels, noArray(), uhist_roi, histSize, ranges, false));
 
-        OCL_EXPECT_MATS_NEAR(hist, 0.0)
+        OCL_EXPECT_MATS_NEAR(hist, 0.0);
     }
 }
 
diff --git a/modules/imgproc/test/ocl/test_imgproc.cpp b/modules/imgproc/test/ocl/test_imgproc.cpp
index 78b2e573d..ad8e26cbc 100644
--- a/modules/imgproc/test/ocl/test_imgproc.cpp
+++ b/modules/imgproc/test/ocl/test_imgproc.cpp
@@ -70,8 +70,8 @@ PARAM_TEST_CASE(ImgprocTestBase, MatType,
     int type, borderType, blockSize;
     bool useRoi;
 
-    TEST_DECLARE_INPUT_PARAMETER(src)
-    TEST_DECLARE_OUTPUT_PARAMETER(dst)
+    TEST_DECLARE_INPUT_PARAMETER(src);
+    TEST_DECLARE_OUTPUT_PARAMETER(dst);
 
     virtual void SetUp()
     {
@@ -90,16 +90,16 @@ PARAM_TEST_CASE(ImgprocTestBase, MatType,
         Border dstBorder = randomBorder(0, useRoi ? MAX_VALUE : 0);
         randomSubMat(dst, dst_roi, roiSize, dstBorder, type, 5, 16);
 
-        UMAT_UPLOAD_INPUT_PARAMETER(src)
-        UMAT_UPLOAD_OUTPUT_PARAMETER(dst)
+        UMAT_UPLOAD_INPUT_PARAMETER(src);
+        UMAT_UPLOAD_OUTPUT_PARAMETER(dst);
     }
 
     void Near(double threshold = 0.0, bool relative = false)
     {
         if (relative)
-            OCL_EXPECT_MATS_NEAR_RELATIVE(dst, threshold)
+            OCL_EXPECT_MATS_NEAR_RELATIVE(dst, threshold);
         else
-            OCL_EXPECT_MATS_NEAR(dst, threshold)
+            OCL_EXPECT_MATS_NEAR(dst, threshold);
     }
 };
 
@@ -117,8 +117,8 @@ PARAM_TEST_CASE(CopyMakeBorder, MatDepth, // depth
     TestUtils::Border border;
     Scalar val;
 
-    TEST_DECLARE_INPUT_PARAMETER(src)
-    TEST_DECLARE_OUTPUT_PARAMETER(dst)
+    TEST_DECLARE_INPUT_PARAMETER(src);
+    TEST_DECLARE_OUTPUT_PARAMETER(dst);
 
     virtual void SetUp()
     {
@@ -148,13 +148,13 @@ PARAM_TEST_CASE(CopyMakeBorder, MatDepth, // depth
 
         randomSubMat(dst, dst_roi, roiSize, dstBorder, type, -MAX_VALUE, MAX_VALUE);
 
-        UMAT_UPLOAD_INPUT_PARAMETER(src)
-        UMAT_UPLOAD_OUTPUT_PARAMETER(dst)
+        UMAT_UPLOAD_INPUT_PARAMETER(src);
+        UMAT_UPLOAD_OUTPUT_PARAMETER(dst);
     }
 
     void Near()
     {
-        OCL_EXPECT_MATS_NEAR(dst, 0)
+        OCL_EXPECT_MATS_NEAR(dst, 0);
     }
 };
 
@@ -217,8 +217,8 @@ struct CornerTestBase :
         Border dstBorder = randomBorder(0, useRoi ? MAX_VALUE : 0);
         randomSubMat(dst, dst_roi, roiSize, dstBorder, CV_32FC1, 5, 16);
 
-        UMAT_UPLOAD_INPUT_PARAMETER(src)
-        UMAT_UPLOAD_OUTPUT_PARAMETER(dst)
+        UMAT_UPLOAD_INPUT_PARAMETER(src);
+        UMAT_UPLOAD_OUTPUT_PARAMETER(dst);
     }
 };
 
@@ -286,7 +286,7 @@ struct Integral :
 {
     int sdepth, sqdepth;
 
-    TEST_DECLARE_OUTPUT_PARAMETER(dst2)
+    TEST_DECLARE_OUTPUT_PARAMETER(dst2);
 
     virtual void SetUp()
     {
@@ -310,17 +310,17 @@ struct Integral :
         Border dst2Border = randomBorder(0, useRoi ? 2 : 0);
         randomSubMat(dst2, dst2_roi, isize, dst2Border, sqdepth, 5, 16);
 
-        UMAT_UPLOAD_INPUT_PARAMETER(src)
-        UMAT_UPLOAD_OUTPUT_PARAMETER(dst)
-        UMAT_UPLOAD_OUTPUT_PARAMETER(dst2)
+        UMAT_UPLOAD_INPUT_PARAMETER(src);
+        UMAT_UPLOAD_OUTPUT_PARAMETER(dst);
+        UMAT_UPLOAD_OUTPUT_PARAMETER(dst2);
     }
 
     void Near2(double threshold = 0.0, bool relative = false)
     {
         if (relative)
-            OCL_EXPECT_MATS_NEAR_RELATIVE(dst2, threshold)
+            OCL_EXPECT_MATS_NEAR_RELATIVE(dst2, threshold);
         else
-            OCL_EXPECT_MATS_NEAR(dst2, threshold)
+            OCL_EXPECT_MATS_NEAR(dst2, threshold);
     }
 };
 
@@ -390,8 +390,8 @@ PARAM_TEST_CASE(CLAHETest, Size, double, bool)
     double clipLimit;
     bool useRoi;
 
-    TEST_DECLARE_INPUT_PARAMETER(src)
-    TEST_DECLARE_OUTPUT_PARAMETER(dst)
+    TEST_DECLARE_INPUT_PARAMETER(src);
+    TEST_DECLARE_OUTPUT_PARAMETER(dst);
 
     virtual void SetUp()
     {
@@ -409,13 +409,13 @@ PARAM_TEST_CASE(CLAHETest, Size, double, bool)
         Border dstBorder = randomBorder(0, useRoi ? MAX_VALUE : 0);
         randomSubMat(dst, dst_roi, roiSize, dstBorder, CV_8UC1, 5, 16);
 
-        UMAT_UPLOAD_INPUT_PARAMETER(src)
-        UMAT_UPLOAD_OUTPUT_PARAMETER(dst)
+        UMAT_UPLOAD_INPUT_PARAMETER(src);
+        UMAT_UPLOAD_OUTPUT_PARAMETER(dst);
     }
 
     void Near(double threshold = 0.0)
     {
-        OCL_EXPECT_MATS_NEAR(dst, threshold)
+        OCL_EXPECT_MATS_NEAR(dst, threshold);
     }
 };
 
diff --git a/modules/imgproc/test/ocl/test_match_template.cpp b/modules/imgproc/test/ocl/test_match_template.cpp
index c283b8c0b..f0a61302d 100644
--- a/modules/imgproc/test/ocl/test_match_template.cpp
+++ b/modules/imgproc/test/ocl/test_match_template.cpp
@@ -62,9 +62,9 @@ PARAM_TEST_CASE(MatchTemplate, MatDepth, Channels, MatchTemplType, bool)
     int method;
     bool use_roi;
 
-    TEST_DECLARE_INPUT_PARAMETER(image)
-    TEST_DECLARE_INPUT_PARAMETER(templ)
-    TEST_DECLARE_OUTPUT_PARAMETER(result)
+    TEST_DECLARE_INPUT_PARAMETER(image);
+    TEST_DECLARE_INPUT_PARAMETER(templ);
+    TEST_DECLARE_OUTPUT_PARAMETER(result);
 
     virtual void SetUp()
     {
@@ -92,9 +92,9 @@ PARAM_TEST_CASE(MatchTemplate, MatDepth, Channels, MatchTemplType, bool)
         Border resultBorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
         randomSubMat(result, result_roi, result_roiSize, resultBorder, CV_32FC1, -upValue, upValue);
 
-        UMAT_UPLOAD_INPUT_PARAMETER(image)
-        UMAT_UPLOAD_INPUT_PARAMETER(templ)
-        UMAT_UPLOAD_OUTPUT_PARAMETER(result)
+        UMAT_UPLOAD_INPUT_PARAMETER(image);
+        UMAT_UPLOAD_INPUT_PARAMETER(templ);
+        UMAT_UPLOAD_OUTPUT_PARAMETER(result);
     }
 
     void Near(double threshold = 0.0)
diff --git a/modules/imgproc/test/ocl/test_medianfilter.cpp b/modules/imgproc/test/ocl/test_medianfilter.cpp
index 011619726..6015ed1d7 100644
--- a/modules/imgproc/test/ocl/test_medianfilter.cpp
+++ b/modules/imgproc/test/ocl/test_medianfilter.cpp
@@ -57,8 +57,8 @@ PARAM_TEST_CASE(MedianFilter, MatDepth, Channels, int, bool)
     int ksize;
     bool use_roi;
 
-    TEST_DECLARE_INPUT_PARAMETER(src)
-    TEST_DECLARE_OUTPUT_PARAMETER(dst)
+    TEST_DECLARE_INPUT_PARAMETER(src);
+    TEST_DECLARE_OUTPUT_PARAMETER(dst);
 
     virtual void SetUp()
     {
@@ -76,8 +76,8 @@ PARAM_TEST_CASE(MedianFilter, MatDepth, Channels, int, bool)
         Border dstBorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
         randomSubMat(dst, dst_roi, roiSize, dstBorder, type, -MAX_VALUE, MAX_VALUE);
 
-        UMAT_UPLOAD_INPUT_PARAMETER(src)
-        UMAT_UPLOAD_OUTPUT_PARAMETER(dst)
+        UMAT_UPLOAD_INPUT_PARAMETER(src);
+        UMAT_UPLOAD_OUTPUT_PARAMETER(dst);
     }
 
     void Near(double threshold = 0.0)
@@ -102,7 +102,7 @@ OCL_TEST_P(MedianFilter, Mat)
 
 OCL_INSTANTIATE_TEST_CASE_P(ImageProc, MedianFilter, Combine(
                                 Values(CV_8U, CV_16U, CV_16S, CV_32F),
-                                Values(1, 2, 4),
+                                OCL_ALL_CHANNELS,
                                 Values(3, 5),
                                 Bool())
                            );
diff --git a/modules/imgproc/test/ocl/test_pyramids.cpp b/modules/imgproc/test/ocl/test_pyramids.cpp
index d6174a5b7..113349b30 100644
--- a/modules/imgproc/test/ocl/test_pyramids.cpp
+++ b/modules/imgproc/test/ocl/test_pyramids.cpp
@@ -57,8 +57,8 @@ PARAM_TEST_CASE(PyrTestBase, MatDepth, Channels, bool)
     int depth, channels;
     bool use_roi;
 
-    TEST_DECLARE_INPUT_PARAMETER(src)
-    TEST_DECLARE_OUTPUT_PARAMETER(dst)
+    TEST_DECLARE_INPUT_PARAMETER(src);
+    TEST_DECLARE_OUTPUT_PARAMETER(dst);
 
     virtual void SetUp()
     {
@@ -75,8 +75,8 @@ PARAM_TEST_CASE(PyrTestBase, MatDepth, Channels, bool)
         Border dstBorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
         randomSubMat(dst, dst_roi, dst_roiSize, dstBorder, CV_MAKETYPE(depth, channels), -MAX_VALUE, MAX_VALUE);
 
-        UMAT_UPLOAD_INPUT_PARAMETER(src)
-        UMAT_UPLOAD_OUTPUT_PARAMETER(dst)
+        UMAT_UPLOAD_INPUT_PARAMETER(src);
+        UMAT_UPLOAD_OUTPUT_PARAMETER(dst);
     }
 
     void Near(double threshold = 0.0)
diff --git a/modules/imgproc/test/ocl/test_sepfilter2D.cpp b/modules/imgproc/test/ocl/test_sepfilter2D.cpp
index 5e824d6b2..f7a18aae1 100644
--- a/modules/imgproc/test/ocl/test_sepfilter2D.cpp
+++ b/modules/imgproc/test/ocl/test_sepfilter2D.cpp
@@ -61,9 +61,10 @@ PARAM_TEST_CASE(SepFilter2D, MatDepth, Channels, BorderType, bool, bool)
     int borderType;
     bool useRoi;
     Mat kernelX, kernelY;
+    double delta;
 
-    TEST_DECLARE_INPUT_PARAMETER(src)
-    TEST_DECLARE_OUTPUT_PARAMETER(dst)
+    TEST_DECLARE_INPUT_PARAMETER(src);
+    TEST_DECLARE_OUTPUT_PARAMETER(dst);
 
     virtual void SetUp()
     {
@@ -75,36 +76,28 @@ PARAM_TEST_CASE(SepFilter2D, MatDepth, Channels, BorderType, bool, bool)
     void random_roi()
     {
         Size ksize = randomSize(kernelMinSize, kernelMaxSize);
-        if (1 != (ksize.width % 2))
+        if (1 != ksize.width % 2)
             ksize.width++;
-        if (1 != (ksize.height % 2))
+        if (1 != ksize.height % 2)
             ksize.height++;
+
         Mat temp = randomMat(Size(ksize.width, 1), CV_MAKE_TYPE(CV_32F, 1), -MAX_VALUE, MAX_VALUE);
         cv::normalize(temp, kernelX, 1.0, 0.0, NORM_L1);
         temp = randomMat(Size(1, ksize.height),  CV_MAKE_TYPE(CV_32F, 1), -MAX_VALUE, MAX_VALUE);
         cv::normalize(temp, kernelY, 1.0, 0.0, NORM_L1);
 
         Size roiSize = randomSize(ksize.width, MAX_VALUE, ksize.height, MAX_VALUE);
-        int rest = roiSize.width % 4;
-        if (0 != rest)
-            roiSize.width += (4 - rest);
         Border srcBorder = randomBorder(0, useRoi ? MAX_VALUE : 0);
-        rest = srcBorder.lef % 4;
-        if (0 != rest)
-            srcBorder.lef += (4 - rest);
-        rest = srcBorder.rig % 4;
-        if (0 != rest)
-            srcBorder.rig += (4 - rest);
         randomSubMat(src, src_roi, roiSize, srcBorder, type, -MAX_VALUE, MAX_VALUE);
 
         Border dstBorder = randomBorder(0, useRoi ? MAX_VALUE : 0);
         randomSubMat(dst, dst_roi, roiSize, dstBorder, type, -MAX_VALUE, MAX_VALUE);
 
-        anchor.x = -1;
-        anchor.y = -1;
+        anchor.x = anchor.y = -1;
+        delta = randomDouble(-100, 100);
 
-        UMAT_UPLOAD_INPUT_PARAMETER(src)
-        UMAT_UPLOAD_OUTPUT_PARAMETER(dst)
+        UMAT_UPLOAD_INPUT_PARAMETER(src);
+        UMAT_UPLOAD_OUTPUT_PARAMETER(dst);
     }
 
     void Near(double threshold = 0.0)
@@ -115,22 +108,21 @@ PARAM_TEST_CASE(SepFilter2D, MatDepth, Channels, BorderType, bool, bool)
 
 OCL_TEST_P(SepFilter2D, Mat)
 {
-    for (int j = 0; j < test_loop_times; j++)
+    for (int j = 0; j < test_loop_times + 3; j++)
     {
         random_roi();
 
-        OCL_OFF(cv::sepFilter2D(src_roi, dst_roi, -1, kernelX, kernelY, anchor, 0.0, borderType));
-        OCL_ON(cv::sepFilter2D(usrc_roi, udst_roi, -1, kernelX, kernelY, anchor, 0.0, borderType));
+        OCL_OFF(cv::sepFilter2D(src_roi, dst_roi, -1, kernelX, kernelY, anchor, delta, borderType));
+        OCL_ON(cv::sepFilter2D(usrc_roi, udst_roi, -1, kernelX, kernelY, anchor, delta, borderType));
 
         Near(1.0);
     }
 }
 
-
 OCL_INSTANTIATE_TEST_CASE_P(ImageProc, SepFilter2D,
                             Combine(
                                 Values(CV_8U, CV_32F),
-                                Values(1, 4),
+                                OCL_ALL_CHANNELS,
                                 Values(
                                         (BorderType)BORDER_CONSTANT,
                                         (BorderType)BORDER_REPLICATE,
diff --git a/modules/imgproc/test/ocl/test_warp.cpp b/modules/imgproc/test/ocl/test_warp.cpp
index 3963b671a..f9ccef8c6 100644
--- a/modules/imgproc/test/ocl/test_warp.cpp
+++ b/modules/imgproc/test/ocl/test_warp.cpp
@@ -73,8 +73,8 @@ PARAM_TEST_CASE(WarpTestBase, MatType, Interpolation, bool, bool)
     Size dsize;
     bool useRoi, mapInverse;
 
-    TEST_DECLARE_INPUT_PARAMETER(src)
-    TEST_DECLARE_OUTPUT_PARAMETER(dst)
+    TEST_DECLARE_INPUT_PARAMETER(src);
+    TEST_DECLARE_OUTPUT_PARAMETER(dst);
 
     virtual void SetUp()
     {
@@ -98,13 +98,13 @@ PARAM_TEST_CASE(WarpTestBase, MatType, Interpolation, bool, bool)
         Border dstBorder = randomBorder(0, useRoi ? MAX_VALUE : 0);
         randomSubMat(dst, dst_roi, dsize, dstBorder, type, -MAX_VALUE, MAX_VALUE);
 
-        UMAT_UPLOAD_INPUT_PARAMETER(src)
-        UMAT_UPLOAD_OUTPUT_PARAMETER(dst)
+        UMAT_UPLOAD_INPUT_PARAMETER(src);
+        UMAT_UPLOAD_OUTPUT_PARAMETER(dst);
     }
 
     void Near(double threshold = 0.0)
     {
-        OCL_EXPECT_MATS_NEAR(dst, threshold)
+        OCL_EXPECT_MATS_NEAR(dst, threshold);
     }
 };
 
@@ -164,8 +164,8 @@ PARAM_TEST_CASE(Resize, MatType, double, double, Interpolation, bool)
     double fx, fy;
     bool useRoi;
 
-    TEST_DECLARE_INPUT_PARAMETER(src)
-    TEST_DECLARE_OUTPUT_PARAMETER(dst)
+    TEST_DECLARE_INPUT_PARAMETER(src);
+    TEST_DECLARE_OUTPUT_PARAMETER(dst);
 
     virtual void SetUp()
     {
@@ -202,7 +202,7 @@ PARAM_TEST_CASE(Resize, MatType, double, double, Interpolation, bool)
 
     void Near(double threshold = 0.0)
     {
-        OCL_EXPECT_MATS_NEAR(dst, threshold)
+        OCL_EXPECT_MATS_NEAR(dst, threshold);
     }
 };
 
@@ -210,12 +210,15 @@ OCL_TEST_P(Resize, Mat)
 {
     for (int j = 0; j < test_loop_times; j++)
     {
+        int depth = CV_MAT_DEPTH(type);
+        double eps = depth <= CV_32S ? 1 : 1e-2;
+
         random_roi();
 
         OCL_OFF(cv::resize(src_roi, dst_roi, Size(), fx, fy, interpolation));
         OCL_ON(cv::resize(usrc_roi, udst_roi, Size(), fx, fy, interpolation));
 
-        Near(1.0);
+        Near(eps);
     }
 }
 
@@ -230,10 +233,10 @@ PARAM_TEST_CASE(Remap, MatDepth, Channels, std::pair<MatType, MatType>, BorderTy
 
     Scalar val;
 
-    TEST_DECLARE_INPUT_PARAMETER(src)
-    TEST_DECLARE_INPUT_PARAMETER(map1)
-    TEST_DECLARE_INPUT_PARAMETER(map2)
-    TEST_DECLARE_OUTPUT_PARAMETER(dst)
+    TEST_DECLARE_INPUT_PARAMETER(src);
+    TEST_DECLARE_INPUT_PARAMETER(map1);
+    TEST_DECLARE_INPUT_PARAMETER(map2);
+    TEST_DECLARE_OUTPUT_PARAMETER(dst);
 
     virtual void SetUp()
     {
@@ -269,16 +272,16 @@ PARAM_TEST_CASE(Remap, MatDepth, Channels, std::pair<MatType, MatType>, BorderTy
             randomSubMat(map2, map2_roi, dstROISize, map2Border, map2Type, mapMinValue, mapMaxValue);
         }
 
-        UMAT_UPLOAD_INPUT_PARAMETER(src)
-        UMAT_UPLOAD_INPUT_PARAMETER(map1)
-        UMAT_UPLOAD_OUTPUT_PARAMETER(dst)
+        UMAT_UPLOAD_INPUT_PARAMETER(src);
+        UMAT_UPLOAD_INPUT_PARAMETER(map1);
+        UMAT_UPLOAD_OUTPUT_PARAMETER(dst);
         if (noType != map2Type)
-            UMAT_UPLOAD_INPUT_PARAMETER(map2)
+            UMAT_UPLOAD_INPUT_PARAMETER(map2);
     }
 
     void Near(double threshold = 0.0)
     {
-        OCL_EXPECT_MATS_NEAR(dst, threshold)
+        OCL_EXPECT_MATS_NEAR(dst, threshold);
     }
 };
 
@@ -328,8 +331,8 @@ OCL_INSTANTIATE_TEST_CASE_P(ImgprocWarp, WarpPerspective, Combine(
 
 OCL_INSTANTIATE_TEST_CASE_P(ImgprocWarp, Resize, Combine(
                             Values(CV_8UC1, CV_8UC4, CV_16UC2, CV_32FC1, CV_32FC4),
-                            Values(0.5, 1.5, 2.0),
-                            Values(0.5, 1.5, 2.0),
+                            Values(0.5, 1.5, 2.0, 0.2),
+                            Values(0.5, 1.5, 2.0, 0.2),
                             Values((Interpolation)INTER_NEAREST, (Interpolation)INTER_LINEAR),
                             Bool()));
 
diff --git a/modules/java/generator/rst_parser.py b/modules/java/generator/rst_parser.py
index f2363169b..750d6f0be 100755
--- a/modules/java/generator/rst_parser.py
+++ b/modules/java/generator/rst_parser.py
@@ -1,5 +1,6 @@
 #!/usr/bin/env python
 
+from __future__ import print_function
 import os, sys, re, string, fnmatch
 allmodules = ["core", "flann", "imgproc", "ml", "highgui", "video", "features2d", "calib3d", "objdetect", "legacy", "contrib", "cuda", "androidcamera", "java", "python", "stitching", "ts", "photo", "nonfree", "videostab", "softcascade", "superres"]
 verbose = False
@@ -141,10 +142,10 @@ class RstParser(object):
     def parse_section_safe(self, module_name, section_name, file_name, lineno, lines):
         try:
             self.parse_section(module_name, section_name, file_name, lineno, lines)
-        except AssertionError, args:
+        except AssertionError as args:
             if show_errors:
-                print >> sys.stderr, "RST parser error E%03d: assertion in \"%s\" at %s:%s" % (ERROR_001_SECTIONFAILURE, section_name, file_name, lineno)
-                print >> sys.stderr, "    Details: %s" % args
+                print("RST parser error E%03d: assertion in \"%s\" at %s:%s" % (ERROR_001_SECTIONFAILURE, section_name, file_name, lineno), file=sys.stderr)
+                print("    Details: %s" % args, file=sys.stderr)
 
     def parse_section(self, module_name, section_name, file_name, lineno, lines):
         self.sections_total += 1
@@ -152,7 +153,7 @@ class RstParser(object):
         #if section_name.find(" ") >= 0 and section_name.find("::operator") < 0:
         if (section_name.find(" ") >= 0 and not bool(re.match(r"(\w+::)*operator\s*(\w+|>>|<<|\(\)|->|\+\+|--|=|==|\+=|-=)", section_name)) ) or section_name.endswith(":"):
             if show_errors:
-                print >> sys.stderr, "RST parser warning W%03d:  SKIPPED: \"%s\" File: %s:%s" % (WARNING_002_HDRWHITESPACE, section_name, file_name, lineno)
+                print("RST parser warning W%03d:  SKIPPED: \"%s\" File: %s:%s" % (WARNING_002_HDRWHITESPACE, section_name, file_name, lineno), file=sys.stderr)
             self.sections_skipped += 1
             return
 
@@ -311,7 +312,7 @@ class RstParser(object):
 
         if fdecl.balance != 0:
             if show_critical_errors:
-                print >> sys.stderr, "RST parser error E%03d: invalid parentheses balance in \"%s\" at %s:%s" % (ERROR_003_PARENTHESES, section_name, file_name, lineno)
+                print("RST parser error E%03d: invalid parentheses balance in \"%s\" at %s:%s" % (ERROR_003_PARENTHESES, section_name, file_name, lineno), file=sys.stderr)
             return
 
         # save last parameter if needed
@@ -328,7 +329,7 @@ class RstParser(object):
         elif func:
             if func["name"] in known_text_sections_names:
                 if show_errors:
-                    print >> sys.stderr, "RST parser warning W%03d:  SKIPPED: \"%s\" File: %s:%s" % (WARNING_002_HDRWHITESPACE, section_name, file_name, lineno)
+                    print("RST parser warning W%03d:  SKIPPED: \"%s\" File: %s:%s" % (WARNING_002_HDRWHITESPACE, section_name, file_name, lineno), file=sys.stderr)
                 self.sections_skipped += 1
             elif show_errors:
                 self.print_info(func, True, sys.stderr)
@@ -351,7 +352,7 @@ class RstParser(object):
             if l.find("\t") >= 0:
                 whitespace_warnings += 1
                 if whitespace_warnings <= max_whitespace_warnings and show_warnings:
-                    print >> sys.stderr, "RST parser warning W%03d: tab symbol instead of space is used at %s:%s" % (WARNING_004_TABS, doc, lineno)
+                    print("RST parser warning W%03d: tab symbol instead of space is used at %s:%s" % (WARNING_004_TABS, doc, lineno), file=sys.stderr)
                 l = l.replace("\t", "    ")
 
             # handle first line
@@ -388,8 +389,8 @@ class RstParser(object):
 
     def add_new_fdecl(self, func, decl):
         if decl.fdecl.endswith(";"):
-            print >> sys.stderr, "RST parser error E%03d: unexpected semicolon at the end of declaration in \"%s\" at %s:%s" \
-                        % (ERROR_011_EOLEXPECTED, func["name"], func["file"], func["line"])
+            print("RST parser error E%03d: unexpected semicolon at the end of declaration in \"%s\" at %s:%s" \
+                        % (ERROR_011_EOLEXPECTED, func["name"], func["file"], func["line"]), file=sys.stderr)
         decls =  func.get("decls", [])
         if (decl.lang == "C++" or decl.lang == "C"):
             rst_decl = self.cpp_parser.parse_func_decl_no_wrap(decl.fdecl)
@@ -405,37 +406,37 @@ class RstParser(object):
             if show_errors:
                 #check black_list
                 if decl.name not in params_blacklist.get(func["name"], []):
-                    print >> sys.stderr, "RST parser error E%03d: redefinition of parameter \"%s\" in \"%s\" at %s:%s" \
-                        % (ERROR_005_REDEFENITIONPARAM, decl.name, func["name"], func["file"], func["line"])
+                    print("RST parser error E%03d: redefinition of parameter \"%s\" in \"%s\" at %s:%s" \
+                        % (ERROR_005_REDEFENITIONPARAM, decl.name, func["name"], func["file"], func["line"]), file=sys.stderr)
         else:
             params[decl.name] = decl.comment
             func["params"] = params
 
     def print_info(self, func, skipped=False, out = sys.stdout):
-        print >> out
+        print(file=out)
         if skipped:
-            print >> out, "SKIPPED DEFINITION:"
-        print >> out, "name:      %s" % (func.get("name","~empty~"))
-        print >> out, "file:      %s:%s" % (func.get("file","~empty~"), func.get("line","~empty~"))
-        print >> out, "is class:  %s" % func.get("isclass", False)
-        print >> out, "is struct: %s" % func.get("isstruct", False)
-        print >> out, "module:    %s" % func.get("module","~unknown~")
-        print >> out, "namespace: %s" % func.get("namespace", "~empty~")
-        print >> out, "class:     %s" % (func.get("class","~empty~"))
-        print >> out, "method:    %s" % (func.get("method","~empty~"))
-        print >> out, "brief:     %s" % (func.get("brief","~empty~"))
+            print("SKIPPED DEFINITION:", file=out)
+        print("name:      %s" % (func.get("name","~empty~")), file=out)
+        print("file:      %s:%s" % (func.get("file","~empty~"), func.get("line","~empty~")), file=out)
+        print("is class:  %s" % func.get("isclass", False), file=out)
+        print("is struct: %s" % func.get("isstruct", False), file=out)
+        print("module:    %s" % func.get("module","~unknown~"), file=out)
+        print("namespace: %s" % func.get("namespace", "~empty~"), file=out)
+        print("class:     %s" % (func.get("class","~empty~")), file=out)
+        print("method:    %s" % (func.get("method","~empty~")), file=out)
+        print("brief:     %s" % (func.get("brief","~empty~")), file=out)
         if "decls" in func:
-            print >> out, "declarations:"
+            print("declarations:", file=out)
             for d in func["decls"]:
-                print >> out, "     %7s: %s" % (d[0], re.sub(r"[ ]+", " ", d[1]))
+                print("     %7s: %s" % (d[0], re.sub(r"[ ]+", " ", d[1])), file=out)
         if "seealso" in func:
-            print >> out, "seealso:  ", func["seealso"]
+            print("seealso:  ", func["seealso"], file=out)
         if "params" in func:
-            print >> out, "parameters:"
+            print("parameters:", file=out)
             for name, comment in func["params"].items():
-                print >> out, "%23s:   %s" % (name, comment)
-        print >> out, "long:      %s" % (func.get("long","~empty~"))
-        print >> out
+                print("%23s:   %s" % (name, comment), file=out)
+        print("long:      %s" % (func.get("long","~empty~")), file=out)
+        print(file=out)
 
     def validate(self, func):
         if func.get("decls", None) is None:
@@ -443,13 +444,13 @@ class RstParser(object):
                 return False
         if func["name"] in self.definitions:
             if show_errors:
-                print >> sys.stderr, "RST parser error E%03d: \"%s\" from: %s:%s is already documented at %s:%s" \
-                    % (ERROR_006_REDEFENITIONFUNC, func["name"], func["file"], func["line"], self.definitions[func["name"]]["file"], self.definitions[func["name"]]["line"])
+                print("RST parser error E%03d: \"%s\" from: %s:%s is already documented at %s:%s" \
+                    % (ERROR_006_REDEFENITIONFUNC, func["name"], func["file"], func["line"], self.definitions[func["name"]]["file"], self.definitions[func["name"]]["line"]), file=sys.stderr)
             return False
         return self.validateParams(func)
 
     def validateParams(self, func):
-        documentedParams = func.get("params", {}).keys()
+        documentedParams = list(func.get("params", {}).keys())
         params = []
 
         for decl in func.get("decls", []):
@@ -464,13 +465,13 @@ class RstParser(object):
         # 1. all params are documented
         for p in params:
             if p not in documentedParams and show_warnings:
-                print >> sys.stderr, "RST parser warning W%03d: parameter \"%s\" of \"%s\" is undocumented. %s:%s" % (WARNING_007_UNDOCUMENTEDPARAM, p, func["name"], func["file"], func["line"])
+                print("RST parser warning W%03d: parameter \"%s\" of \"%s\" is undocumented. %s:%s" % (WARNING_007_UNDOCUMENTEDPARAM, p, func["name"], func["file"], func["line"]), file=sys.stderr)
 
         # 2. only real params are documented
         for p in documentedParams:
             if p not in params and show_warnings:
                 if p not in params_blacklist.get(func["name"], []):
-                    print >> sys.stderr, "RST parser warning W%03d: unexisting parameter \"%s\" of \"%s\" is documented at %s:%s" % (WARNING_008_MISSINGPARAM, p, func["name"], func["file"], func["line"])
+                    print("RST parser warning W%03d: unexisting parameter \"%s\" of \"%s\" is documented at %s:%s" % (WARNING_008_MISSINGPARAM, p, func["name"], func["file"], func["line"]), file=sys.stderr)
         return True
 
     def normalize(self, func):
@@ -541,7 +542,7 @@ class RstParser(object):
                 func["name"] = fname[4:]
                 func["method"] = fname[4:]
             elif show_warnings:
-                print >> sys.stderr, "RST parser warning W%03d:  \"%s\" - section name is \"%s\" instead of \"%s\" at %s:%s" % (WARNING_009_HDRMISMATCH, fname, func["name"], fname[6:], func["file"], func["line"])
+                print("RST parser warning W%03d:  \"%s\" - section name is \"%s\" instead of \"%s\" at %s:%s" % (WARNING_009_HDRMISMATCH, fname, func["name"], fname[6:], func["file"], func["line"]), file=sys.stderr)
                 #self.print_info(func)
 
     def normalizeText(self, s):
@@ -632,11 +633,11 @@ class RstParser(object):
         return s
 
     def printSummary(self):
-        print "RST Parser Summary:"
-        print "  Total sections:   %s" % self.sections_total
-        print "  Skipped sections: %s" % self.sections_skipped
-        print "  Parsed  sections: %s" % self.sections_parsed
-        print "  Invalid sections: %s" % (self.sections_total - self.sections_parsed - self.sections_skipped)
+        print("RST Parser Summary:")
+        print("  Total sections:   %s" % self.sections_total)
+        print("  Skipped sections: %s" % self.sections_skipped)
+        print("  Parsed  sections: %s" % self.sections_parsed)
+        print("  Invalid sections: %s" % (self.sections_total - self.sections_parsed - self.sections_skipped))
 
         # statistic by language
         stat = {}
@@ -651,12 +652,12 @@ class RstParser(object):
                 for decl in d.get("decls", []):
                     stat[decl[0]] = stat.get(decl[0], 0) + 1
 
-        print
-        print "  classes documented:           %s" % classes
-        print "  structs documented:           %s" % structs
+        print()
+        print("  classes documented:           %s" % classes)
+        print("  structs documented:           %s" % structs)
         for lang in sorted(stat.items()):
-            print "  %7s functions documented: %s" % lang
-        print
+            print("  %7s functions documented: %s" % lang)
+        print()
 
 def mathReplace2(match):
     m = mathReplace(match)
@@ -743,7 +744,7 @@ def mathReplace(match):
 
 if __name__ == "__main__":
     if len(sys.argv) < 2:
-        print "Usage:\n", os.path.basename(sys.argv[0]), " <module path>"
+        print("Usage:\n", os.path.basename(sys.argv[0]), " <module path>")
         exit(0)
 
     if len(sys.argv) >= 3:
@@ -759,7 +760,7 @@ if __name__ == "__main__":
     module = sys.argv[1]
 
     if module != "all" and not os.path.isdir(os.path.join(rst_parser_dir, "../../" + module)):
-        print "RST parser error E%03d: module \"%s\" could not be found." % (ERROR_010_NOMODULE, module)
+        print("RST parser error E%03d: module \"%s\" could not be found." % (ERROR_010_NOMODULE, module))
         exit(1)
 
     parser = RstParser(hdr_parser.CppHeaderParser())
diff --git a/modules/java/generator/src/java/android+JavaCameraView.java b/modules/java/generator/src/java/android+JavaCameraView.java
index 0acd85c19..c29ba2b6f 100644
--- a/modules/java/generator/src/java/android+JavaCameraView.java
+++ b/modules/java/generator/src/java/android+JavaCameraView.java
@@ -288,7 +288,7 @@ public class JavaCameraView extends CameraBridgeViewBase implements PreviewCallb
         }
 
         public Mat rgba() {
-            Imgproc.cvtColor(mYuvFrameData, mRgba, Imgproc.COLOR_YUV2BGR_NV12, 4);
+            Imgproc.cvtColor(mYuvFrameData, mRgba, Imgproc.COLOR_YUV2RGBA_NV21, 4);
             return mRgba;
         }
 
diff --git a/modules/java/generator/src/java/android+NativeCameraView.java b/modules/java/generator/src/java/android+NativeCameraView.java
index 8035d0437..db146d8ae 100644
--- a/modules/java/generator/src/java/android+NativeCameraView.java
+++ b/modules/java/generator/src/java/android+NativeCameraView.java
@@ -175,7 +175,6 @@ public class NativeCameraView extends CameraBridgeViewBase {
                 }
 
                 deliverAndDrawFrame(mFrame);
-
             } while (!mStopThread);
         }
     }
diff --git a/modules/matlab/generator/build_info.py b/modules/matlab/generator/build_info.py
index 65619a2a7..1340d9f92 100644
--- a/modules/matlab/generator/build_info.py
+++ b/modules/matlab/generator/build_info.py
@@ -21,7 +21,7 @@ def substitute(build, output_dir):
     # populate template
     populated = template.render(build=build, time=time)
     with open(os.path.join(output_dir, 'buildInformation.m'), 'wb') as f:
-        f.write(populated)
+        f.write(populated.encode('utf-8'))
 
 if __name__ == "__main__":
     """
diff --git a/modules/matlab/generator/cvmex.py b/modules/matlab/generator/cvmex.py
index 52c5f649f..731d30a0e 100644
--- a/modules/matlab/generator/cvmex.py
+++ b/modules/matlab/generator/cvmex.py
@@ -22,7 +22,7 @@ def substitute(cv, output_dir):
     # populate template
     populated = template.render(cv=cv, time=time)
     with open(os.path.join(output_dir, 'mex.m'), 'wb') as f:
-        f.write(populated)
+        f.write(populated.encode('utf-8'))
 
 if __name__ == "__main__":
     """
diff --git a/modules/matlab/generator/filters.py b/modules/matlab/generator/filters.py
index 6251c8305..de69ff7e4 100644
--- a/modules/matlab/generator/filters.py
+++ b/modules/matlab/generator/filters.py
@@ -1,5 +1,4 @@
 from textwrap import TextWrapper
-from string import split, join
 import re, os
 # precompile a URL matching regular expression
 urlexpr = re.compile(r"((https?):((//)|(\\\\))+[\w\d:#@%/;$()~_?\+-=\\\.&]*)", re.MULTILINE|re.UNICODE)
@@ -177,4 +176,4 @@ def comment(text, wrap=80, escape='% ', escape_first='', escape_last=''):
     escapn = '\n'+escape
     lines  = text.split('\n')
     wlines = (tw.wrap(line) for line in lines)
-    return escape_first+escape+join((join(line, escapn) for line in wlines), escapn)+escape_last
+    return escape_first+escape+escapn.join(escapn.join(line) for line in wlines)+escape_last
diff --git a/modules/matlab/generator/gen_matlab.py b/modules/matlab/generator/gen_matlab.py
index 49e575099..36d588c92 100644
--- a/modules/matlab/generator/gen_matlab.py
+++ b/modules/matlab/generator/gen_matlab.py
@@ -1,4 +1,8 @@
 #!/usr/bin/env python
+import sys, re, os, time
+from string import Template
+from parse_tree import ParseTree, todict, constants
+from filters import *
 
 class MatlabWrapperGenerator(object):
     """
@@ -22,9 +26,14 @@ class MatlabWrapperGenerator(object):
         The output_dir specifies the directory to write the generated sources
         to.
         """
+        # dynamically import the parsers
+        from jinja2 import Environment, FileSystemLoader
+        import hdr_parser
+        import rst_parser
+
         # parse each of the files and store in a dictionary
         # as a separate "namespace"
-        parser = CppHeaderParser()
+        parser = hdr_parser.CppHeaderParser()
         rst    = rst_parser.RstParser(parser)
         rst_parser.verbose = False
         rst_parser.show_warnings = False
@@ -91,13 +100,13 @@ class MatlabWrapperGenerator(object):
         output_class_dir   = output_dir+'/+cv'
         output_map_dir     = output_dir+'/map'
         if not os.path.isdir(output_source_dir):
-          os.mkdir(output_source_dir)
+          os.makedirs(output_source_dir)
         if not os.path.isdir(output_private_dir):
-          os.mkdir(output_private_dir)
+          os.makedirs(output_private_dir)
         if not os.path.isdir(output_class_dir):
-          os.mkdir(output_class_dir)
+          os.makedirs(output_class_dir)
         if not os.path.isdir(output_map_dir):
-          os.mkdir(output_map_dir)
+          os.makedirs(output_map_dir)
 
         # populate templates
         for namespace in parse_tree.namespaces:
@@ -105,27 +114,27 @@ class MatlabWrapperGenerator(object):
             for method in namespace.methods:
                 populated = tfunction.render(fun=method, time=time, includes=namespace.name)
                 with open(output_source_dir+'/'+method.name+'.cpp', 'wb') as f:
-                    f.write(populated)
+                    f.write(populated.encode('utf-8'))
                 if namespace.name in doc and method.name in doc[namespace.name]:
                     populated = tdoc.render(fun=method, doc=doc[namespace.name][method.name], time=time)
                     with open(output_class_dir+'/'+method.name+'.m', 'wb') as f:
-                        f.write(populated)
+                        f.write(populated.encode('utf-8'))
             # classes
             for clss in namespace.classes:
                 # cpp converter
                 populated = tclassc.render(clss=clss, time=time)
                 with open(output_private_dir+'/'+clss.name+'Bridge.cpp', 'wb') as f:
-                    f.write(populated)
+                    f.write(populated.encode('utf-8'))
                 # matlab classdef
                 populated = tclassm.render(clss=clss, time=time)
                 with open(output_class_dir+'/'+clss.name+'.m', 'wb') as f:
-                    f.write(populated)
+                    f.write(populated.encode('utf-8'))
 
         # create a global constants lookup table
         const = dict(constants(todict(parse_tree.namespaces)))
         populated = tconst.render(constants=const, time=time)
         with open(output_dir+'/cv.m', 'wb') as f:
-            f.write(populated)
+            f.write(populated.encode('utf-8'))
 
 
 if __name__ == "__main__":
@@ -168,7 +177,6 @@ if __name__ == "__main__":
     """
 
     # parse the input options
-    import sys, re, os, time
     from argparse import ArgumentParser
     parser = ArgumentParser()
     parser.add_argument('--jinja2')
@@ -185,13 +193,6 @@ if __name__ == "__main__":
     sys.path.append(args.hdrparser)
     sys.path.append(args.rstparser)
 
-    from string import Template
-    from hdr_parser import CppHeaderParser
-    import rst_parser
-    from parse_tree import ParseTree, todict, constants
-    from filters import *
-    from jinja2 import Environment, FileSystemLoader
-
     # create the generator
     mwg = MatlabWrapperGenerator()
     mwg.gen(args.moduleroot, args.modules, args.extra, args.outdir)
diff --git a/modules/matlab/generator/parse_tree.py b/modules/matlab/generator/parse_tree.py
index daea53c2f..a6a146a55 100644
--- a/modules/matlab/generator/parse_tree.py
+++ b/modules/matlab/generator/parse_tree.py
@@ -1,6 +1,12 @@
-from string import join
+import collections
 from textwrap import fill
 from filters import *
+try:
+  # Python 2.7+
+  basestring
+except NameError:
+  # Python 3.3+
+  basestring = str
 
 class ParseTree(object):
     """
@@ -74,7 +80,7 @@ class ParseTree(object):
         self.namespaces = namespaces if namespaces else []
 
     def __str__(self):
-        return join((ns.__str__() for ns in self.namespaces), '\n\n\n')
+        return '\n\n\n'.join(ns.__str__() for ns in self.namespaces)
 
     def build(self, namespaces):
         babel = Translator()
@@ -94,7 +100,7 @@ class ParseTree(object):
                     constants.append(obj)
                 else:
                     raise TypeError('Unexpected object type: '+str(type(obj)))
-            self.namespaces.append(Namespace(name, constants, class_tree.values(), methods))
+            self.namespaces.append(Namespace(name, constants, list(class_tree.values()), methods))
 
     def insertIntoClassTree(self, obj, class_tree):
         cname = obj.name if type(obj) is Class else obj.clss
@@ -208,9 +214,9 @@ class Namespace(object):
 
     def __str__(self):
         return 'namespace '+self.name+' {\n\n'+\
-          (join((c.__str__() for c in self.constants), '\n')+'\n\n' if self.constants else '')+\
-          (join((f.__str__() for f in self.methods), '\n')+'\n\n' if self.methods else '')+\
-          (join((o.__str__() for o in self.classes), '\n\n')        if self.classes   else '')+'\n};'
+          ('\n'.join(c.__str__() for c in self.constants)+'\n\n' if self.constants else '')+\
+          ('\n'.join(f.__str__() for f in self.methods)+'\n\n'   if self.methods   else '')+\
+          ('\n\n'.join(o.__str__() for o in self.classes)        if self.classes   else '')+'\n};'
 
 class Class(object):
     """
@@ -228,8 +234,8 @@ class Class(object):
 
     def __str__(self):
         return 'class '+self.name+' {\n\t'+\
-          (join((c.__str__() for c in self.constants), '\n\t')+'\n\n\t' if self.constants else '')+\
-          (join((f.__str__() for f in self.methods), '\n\t')          if self.methods else '')+'\n};'
+          ('\n\t'.join(c.__str__() for c in self.constants)+'\n\n\t' if self.constants else '')+\
+          ('\n\t'.join(f.__str__() for f in self.methods)            if self.methods   else '')+'\n};'
 
 class Method(object):
     """
@@ -260,7 +266,7 @@ class Method(object):
 
     def __str__(self):
         return (self.rtp+' ' if self.rtp else '')+self.name+'('+\
-          join((arg.__str__() for arg in self.req+self.opt), ', ')+\
+          ', '.join(arg.__str__() for arg in self.req+self.opt)+\
           ')'+(' const' if self.const else '')+';'
 
 class Argument(object):
@@ -334,23 +340,30 @@ def constants(tree):
             for gen in constants(val):
                 yield gen
 
-def todict(obj, classkey=None):
+
+def todict(obj):
     """
-    Convert the ParseTree to a dictionary, stripping all objects of their
-    methods and converting class names to strings
+    Recursively convert a Python object graph to sequences (lists)
+    and mappings (dicts) of primitives (bool, int, float, string, ...)
+
+    Strings are returned unchanged, dicts are rebuilt with converted
+    values, other iterables become lists, and objects are converted
+    through their __dict__ (or __slots__) attributes.
     """
-    if isinstance(obj, dict):
-        for k in obj.keys():
-            obj[k] = todict(obj[k], classkey)
-        return obj
-    elif hasattr(obj, "__iter__"):
-        return [todict(v, classkey) for v in obj]
-    elif hasattr(obj, "__dict__"):
-        data = dict([(key, todict(value, classkey))
-            for key, value in obj.__dict__.iteritems()
-            if not callable(value) and not key.startswith('_')])
-        if classkey is not None and hasattr(obj, "__class__"):
-            data[classkey] = obj.__class__.__name__
-        return data
-    else:
+    # collections.Iterable was removed in Python 3.10; the ABC lives in
+    # collections.abc on Python 3 and in collections on Python 2.
+    try:
+        from collections.abc import Iterable
+    except ImportError:
+        from collections import Iterable
+    if isinstance(obj, basestring):
         return obj
+    elif isinstance(obj, dict):
+        return dict((key, todict(val)) for key, val in obj.items())
+    elif isinstance(obj, Iterable):
+        return [todict(val) for val in obj]
+    elif hasattr(obj, '__dict__'):
+        return todict(vars(obj))
+    elif hasattr(obj, '__slots__'):
+        return todict(dict((name, getattr(obj, name)) for name in getattr(obj, '__slots__')))
+    return obj
diff --git a/modules/nonfree/perf/perf_precomp.hpp b/modules/nonfree/perf/perf_precomp.hpp
index 240bb65a7..45478eb8a 100644
--- a/modules/nonfree/perf/perf_precomp.hpp
+++ b/modules/nonfree/perf/perf_precomp.hpp
@@ -9,12 +9,13 @@
 #ifndef __OPENCV_PERF_PRECOMP_HPP__
 #define __OPENCV_PERF_PRECOMP_HPP__
 
+#include "cvconfig.h"
+
 #include "opencv2/ts.hpp"
 #include "opencv2/nonfree.hpp"
 #include "opencv2/highgui.hpp"
 
 #include "opencv2/opencv_modules.hpp"
-#include "cvconfig.h"
 
 #ifdef HAVE_OPENCV_OCL
 #  include "opencv2/nonfree/ocl.hpp"
diff --git a/modules/nonfree/perf/perf_surf.cpp b/modules/nonfree/perf/perf_surf.cpp
index d9129775f..09de5232e 100644
--- a/modules/nonfree/perf/perf_surf.cpp
+++ b/modules/nonfree/perf/perf_surf.cpp
@@ -16,9 +16,7 @@ PERF_TEST_P(surf, detect, testing::Values(SURF_IMAGES))
 {
     string filename = getDataPath(GetParam());
     Mat frame = imread(filename, IMREAD_GRAYSCALE);
-
-    if (frame.empty())
-        FAIL() << "Unable to load source image " << filename;
+    ASSERT_FALSE(frame.empty()) << "Unable to load source image " << filename;
 
     Mat mask;
     declare.in(frame).time(90);
@@ -34,9 +32,7 @@ PERF_TEST_P(surf, extract, testing::Values(SURF_IMAGES))
 {
     string filename = getDataPath(GetParam());
     Mat frame = imread(filename, IMREAD_GRAYSCALE);
-
-    if (frame.empty())
-        FAIL() << "Unable to load source image " << filename;
+    ASSERT_FALSE(frame.empty()) << "Unable to load source image " << filename;
 
     Mat mask;
     declare.in(frame).time(90);
@@ -55,9 +51,7 @@ PERF_TEST_P(surf, full, testing::Values(SURF_IMAGES))
 {
     string filename = getDataPath(GetParam());
     Mat frame = imread(filename, IMREAD_GRAYSCALE);
-
-    if (frame.empty())
-        FAIL() << "Unable to load source image " << filename;
+    ASSERT_FALSE(frame.empty()) << "Unable to load source image " << filename;
 
     Mat mask;
     declare.in(frame).time(90);
diff --git a/modules/objdetect/perf/opencl/perf_cascades.cpp b/modules/objdetect/perf/opencl/perf_cascades.cpp
index b660f5911..dd61cdb66 100644
--- a/modules/objdetect/perf/opencl/perf_cascades.cpp
+++ b/modules/objdetect/perf/opencl/perf_cascades.cpp
@@ -18,8 +18,6 @@ OCL_PERF_TEST_P(Cascade_Image_MinSize, CascadeClassifier,
                  testing::Combine(
                     testing::Values( string("cv/cascadeandhog/cascades/haarcascade_frontalface_alt.xml"),
                                      string("cv/cascadeandhog/cascades/haarcascade_frontalface_alt2.xml"),
-                                     string("cv/cascadeandhog/cascades/haarcascade_frontalface_alt_old.xml"),
-                                     string("cv/cascadeandhog/cascades/haarcascade_frontalface_alt2_old.xml"),
                                      string("cv/cascadeandhog/cascades/lbpcascade_frontalface.xml") ),
                     testing::Values( string("cv/shared/lena.png"),
                                      string("cv/cascadeandhog/images/bttf301.png"),
diff --git a/modules/objdetect/src/cascadedetect.cpp b/modules/objdetect/src/cascadedetect.cpp
index 3f0e6e38c..2d5c0795d 100644
--- a/modules/objdetect/src/cascadedetect.cpp
+++ b/modules/objdetect/src/cascadedetect.cpp
@@ -765,11 +765,8 @@ bool LBPEvaluator::read( const FileNode& node, Size _origWinSize )
     nchannels = 1;
     localSize = lbufSize = Size(0, 0);
     if (ocl::haveOpenCL())
-    {
-        const ocl::Device& device = ocl::Device::getDefault();
-        if (device.isAMD() && !device.hostUnifiedMemory())
-            localSize = Size(8, 8);
-    }
+        localSize = Size(8, 8);
+
     return true;
 }
 
diff --git a/modules/objdetect/src/hog.cpp b/modules/objdetect/src/hog.cpp
index 18bb7afc2..0f4456ad5 100644
--- a/modules/objdetect/src/hog.cpp
+++ b/modules/objdetect/src/hog.cpp
@@ -1085,8 +1085,8 @@ static bool ocl_compute_gradients_8UC1(int height, int width, InputArray _img, f
     size_t globalThreads[3] = { width, height, 1 };
     char correctGamma = (correct_gamma) ? 1 : 0;
     int grad_quadstep = (int)grad.step >> 3;
-    int qangle_step_shift = 0;
-    int qangle_step = (int)qangle.step >> (1 + qangle_step_shift);
+    int qangle_elem_size = CV_ELEM_SIZE1(qangle.type());
+    int qangle_step = (int)qangle.step / (2 * qangle_elem_size);
 
     int idx = 0;
     idx = k.set(idx, height);
@@ -1137,9 +1137,9 @@ static bool ocl_compute_hists(int nbins, int block_stride_x, int block_stride_y,
     int img_block_height = (height - CELLS_PER_BLOCK_Y * CELL_HEIGHT + block_stride_y)/block_stride_y;
     int blocks_total = img_block_width * img_block_height;
 
-    int qangle_step_shift = 0;
+    int qangle_elem_size = CV_ELEM_SIZE1(qangle.type());
     int grad_quadstep = (int)grad.step >> 2;
-    int qangle_step = (int)qangle.step >> qangle_step_shift;
+    int qangle_step = (int)qangle.step / qangle_elem_size;
 
     int blocks_in_group = 4;
     size_t localThreads[3] = { blocks_in_group * 24, 2, 1 };
@@ -1316,11 +1316,12 @@ static bool ocl_extract_descrs_by_cols(int win_height, int win_width, int block_
 static bool ocl_compute(InputArray _img, Size win_stride, std::vector<float>& _descriptors, int descr_format, Size blockSize,
                         Size cellSize, int nbins, Size blockStride, Size winSize, float sigma, bool gammaCorrection, double L2HysThreshold)
 {
-     Size imgSize = _img.size();
+    Size imgSize = _img.size();
     Size effect_size = imgSize;
 
     UMat grad(imgSize, CV_32FC2);
-    UMat qangle(imgSize, CV_8UC2);
+    int qangle_type = ocl::Device::getDefault().isIntel() ? CV_32SC2 : CV_8UC2;
+    UMat qangle(imgSize, qangle_type);
 
     const size_t block_hist_size = getBlockHistogramSize(blockSize, cellSize, nbins);
     const Size blocks_per_img = numPartsWithin(imgSize, blockSize, blockStride);
@@ -1720,7 +1721,8 @@ static bool ocl_detect(InputArray img, std::vector<Point> &hits, double hit_thre
     Size imgSize = img.size();
     Size effect_size = imgSize;
     UMat grad(imgSize, CV_32FC2);
-    UMat qangle(imgSize, CV_8UC2);
+    int qangle_type = ocl::Device::getDefault().isIntel() ? CV_32SC2 : CV_8UC2;
+    UMat qangle(imgSize, qangle_type);
 
     const size_t block_hist_size = getBlockHistogramSize(blockSize, cellSize, nbins);
     const Size blocks_per_img = numPartsWithin(imgSize, blockSize, blockStride);
diff --git a/modules/objdetect/src/opencl/objdetect_hog.cl b/modules/objdetect/src/opencl/objdetect_hog.cl
index 082f9ab7f..704dec444 100644
--- a/modules/objdetect/src/opencl/objdetect_hog.cl
+++ b/modules/objdetect/src/opencl/objdetect_hog.cl
@@ -50,6 +50,14 @@
 #define NTHREADS 256
 #define CV_PI_F 3.1415926535897932384626433832795f
 
+#ifdef INTEL_DEVICE
+#define QANGLE_TYPE     int
+#define QANGLE_TYPE2    int2
+#else
+#define QANGLE_TYPE     uchar
+#define QANGLE_TYPE2    uchar2
+#endif
+
 //----------------------------------------------------------------------------
 // Histogram computation
 // 12 threads for a cell, 12x4 threads per block
@@ -59,7 +67,7 @@ __kernel void compute_hists_lut_kernel(
     const int cnbins, const int cblock_hist_size, const int img_block_width,
     const int blocks_in_group, const int blocks_total,
     const int grad_quadstep, const int qangle_step,
-    __global const float* grad, __global const uchar* qangle,
+    __global const float* grad, __global const QANGLE_TYPE* qangle,
     __global const float* gauss_w_lut,
     __global float* block_hists, __local float* smem)
 {
@@ -86,7 +94,7 @@ __kernel void compute_hists_lut_kernel(
 
     __global const float* grad_ptr = (gid < blocks_total) ?
         grad + offset_y * grad_quadstep + (offset_x << 1) : grad;
-    __global const uchar* qangle_ptr = (gid < blocks_total) ?
+    __global const QANGLE_TYPE* qangle_ptr = (gid < blocks_total) ?
         qangle + offset_y * qangle_step + (offset_x << 1) : qangle;
 
     __local float* hist = hists + 12 * (cell_y * CELLS_PER_BLOCK_Y + cell_x) +
@@ -101,7 +109,7 @@ __kernel void compute_hists_lut_kernel(
     for (int dist_y = dist_y_begin; dist_y < dist_y_begin + 12; ++dist_y)
     {
         float2 vote = (float2) (grad_ptr[0], grad_ptr[1]);
-        uchar2 bin = (uchar2) (qangle_ptr[0], qangle_ptr[1]);
+        QANGLE_TYPE2 bin = (QANGLE_TYPE2) (qangle_ptr[0], qangle_ptr[1]);
 
         grad_ptr += grad_quadstep;
         qangle_ptr += qangle_step;
@@ -133,9 +141,8 @@ __kernel void compute_hists_lut_kernel(
             final_hist[(cell_x * 2 + cell_y) * cnbins + bin_id] =
                 hist_[0] + hist_[1] + hist_[2];
     }
-#ifdef CPU
+
     barrier(CLK_LOCAL_MEM_FENCE);
-#endif
 
     int tid = (cell_y * CELLS_PER_BLOCK_Y + cell_x) * 12 + cell_thread_x;
     if ((tid < cblock_hist_size) && (gid < blocks_total))
@@ -558,7 +565,7 @@ __kernel void extract_descrs_by_cols_kernel(
 __kernel void compute_gradients_8UC4_kernel(
     const int height, const int width,
     const int img_step, const int grad_quadstep, const int qangle_step,
-    const __global uchar4 * img, __global float * grad, __global uchar * qangle,
+    const __global uchar4 * img, __global float * grad, __global QANGLE_TYPE * qangle,
     const float angle_scale, const char correct_gamma, const int cnbins)
 {
     const int x = get_global_id(0);
@@ -660,7 +667,7 @@ __kernel void compute_gradients_8UC4_kernel(
 __kernel void compute_gradients_8UC1_kernel(
     const int height, const int width,
     const int img_step, const int grad_quadstep, const int qangle_step,
-    __global const uchar * img, __global float * grad, __global uchar * qangle,
+    __global const uchar * img, __global float * grad, __global QANGLE_TYPE * qangle,
     const float angle_scale, const char correct_gamma, const int cnbins)
 {
     const int x = get_global_id(0);
diff --git a/modules/objdetect/test/opencl/test_hogdetector.cpp b/modules/objdetect/test/opencl/test_hogdetector.cpp
index 8568352b6..b3ef6b48f 100644
--- a/modules/objdetect/test/opencl/test_hogdetector.cpp
+++ b/modules/objdetect/test/opencl/test_hogdetector.cpp
@@ -110,7 +110,7 @@ OCL_TEST_P(HOG, Detect)
     OCL_OFF(hog.detectMultiScale(img, cpu_found, 0, Size(8, 8), Size(0, 0), 1.05, 6));
     OCL_ON(hog.detectMultiScale(uimg, gpu_found, 0, Size(8, 8), Size(0, 0), 1.05, 6));
 
-    EXPECT_LT(checkRectSimilarity(img.size(), cpu_found, gpu_found), 1.0);
+    EXPECT_LT(checkRectSimilarity(img.size(), cpu_found, gpu_found), 0.05);
 }
 
 INSTANTIATE_TEST_CASE_P(OCL_ObjDetect, HOG, testing::Combine(
diff --git a/modules/photo/src/fast_nlmeans_denoising_opencl.hpp b/modules/photo/src/fast_nlmeans_denoising_opencl.hpp
index 515237983..2ec9b9487 100644
--- a/modules/photo/src/fast_nlmeans_denoising_opencl.hpp
+++ b/modules/photo/src/fast_nlmeans_denoising_opencl.hpp
@@ -72,7 +72,7 @@ static bool ocl_fastNlMeansDenoising(InputArray _src, OutputArray _dst, float h,
     int type = _src.type(), cn = CV_MAT_CN(type);
     Size size = _src.size();
 
-    if ( type != CV_8UC1 || type != CV_8UC2 || type != CV_8UC4 )
+    if ( type != CV_8UC1 && type != CV_8UC2 && type != CV_8UC4 )
         return false;
 
     int templateWindowHalfWize = templateWindowSize / 2;
diff --git a/modules/photo/src/merge.cpp b/modules/photo/src/merge.cpp
index 7adfb5ec6..295e03c95 100644
--- a/modules/photo/src/merge.cpp
+++ b/modules/photo/src/merge.cpp
@@ -208,7 +208,7 @@ public:
             if(channels == 3) {
                 weights[i] = weights[i].mul(saturation);
             }
-            weights[i] = weights[i].mul(wellexp);
+            weights[i] = weights[i].mul(wellexp) + 1e-12f;
             weight_sum += weights[i];
         }
         int maxlevel = static_cast<int>(logf(static_cast<float>(min(size.width, size.height))) / logf(2.0f));
diff --git a/modules/photo/test/ocl/test_denoising.cpp b/modules/photo/test/ocl/test_denoising.cpp
index c4a9f05f3..b533399cc 100644
--- a/modules/photo/test/ocl/test_denoising.cpp
+++ b/modules/photo/test/ocl/test_denoising.cpp
@@ -19,8 +19,8 @@ PARAM_TEST_CASE(FastNlMeansDenoisingTestBase, Channels, bool)
     float h;
     bool use_roi;
 
-    TEST_DECLARE_INPUT_PARAMETER(src)
-    TEST_DECLARE_OUTPUT_PARAMETER(dst)
+    TEST_DECLARE_INPUT_PARAMETER(src);
+    TEST_DECLARE_OUTPUT_PARAMETER(dst);
 
     virtual void SetUp()
     {
@@ -52,8 +52,8 @@ PARAM_TEST_CASE(FastNlMeansDenoisingTestBase, Channels, bool)
         Border dstBorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
         randomSubMat(dst, dst_roi, roiSize, dstBorder, type, 0, 255);
 
-        UMAT_UPLOAD_INPUT_PARAMETER(src)
-        UMAT_UPLOAD_OUTPUT_PARAMETER(dst)
+        UMAT_UPLOAD_INPUT_PARAMETER(src);
+        UMAT_UPLOAD_OUTPUT_PARAMETER(dst);
     }
 };
 
@@ -68,7 +68,7 @@ OCL_TEST_P(FastNlMeansDenoising, Mat)
         OCL_OFF(cv::fastNlMeansDenoising(src_roi, dst_roi, h, templateWindowSize, searchWindowSize));
         OCL_ON(cv::fastNlMeansDenoising(usrc_roi, udst_roi, h, templateWindowSize, searchWindowSize));
 
-        OCL_EXPECT_MATS_NEAR(dst, 1)
+        OCL_EXPECT_MATS_NEAR(dst, 1);
     }
 }
 
@@ -83,7 +83,7 @@ OCL_TEST_P(FastNlMeansDenoisingColored, Mat)
         OCL_OFF(cv::fastNlMeansDenoisingColored(src_roi, dst_roi, h, h, templateWindowSize, searchWindowSize));
         OCL_ON(cv::fastNlMeansDenoisingColored(usrc_roi, udst_roi, h, h, templateWindowSize, searchWindowSize));
 
-        OCL_EXPECT_MATS_NEAR(dst, 1)
+        OCL_EXPECT_MATS_NEAR(dst, 1);
     }
 }
 
diff --git a/modules/photo/test/test_hdr.cpp b/modules/photo/test/test_hdr.cpp
index 82ae25f52..27773fb38 100644
--- a/modules/photo/test/test_hdr.cpp
+++ b/modules/photo/test/test_hdr.cpp
@@ -166,6 +166,16 @@ TEST(Photo_MergeMertens, regression)
     merge->process(images, result);
     result.convertTo(result, CV_8UC3, 255);
     checkEqual(expected, result, 3, "Mertens");
+
+    Mat uniform(100, 100, CV_8UC3);
+    uniform = Scalar(0, 255, 0);
+
+    images.clear();
+    images.push_back(uniform);
+
+    merge->process(images, result);
+    result.convertTo(result, CV_8UC3, 255);
+    checkEqual(uniform, result, 1e-2f, "Mertens");
 }
 
 TEST(Photo_MergeDebevec, regression)
diff --git a/modules/superres/CMakeLists.txt b/modules/superres/CMakeLists.txt
index 092b1cd40..c360303f6 100644
--- a/modules/superres/CMakeLists.txt
+++ b/modules/superres/CMakeLists.txt
@@ -1,4 +1,4 @@
-if(ANDROID OR IOS)
+if(IOS)
   ocv_module_disable(superres)
 endif()
 
diff --git a/modules/superres/src/btv_l1.cpp b/modules/superres/src/btv_l1.cpp
index 1e4aa48a7..d54b4b398 100644
--- a/modules/superres/src/btv_l1.cpp
+++ b/modules/superres/src/btv_l1.cpp
@@ -1014,10 +1014,8 @@ namespace
             return;
 
 #ifdef HAVE_OPENCL
-        if (isUmat_ && curFrame_.channels() == 1)
+        if (isUmat_)
             curFrame_.copyTo(ucurFrame_);
-        else
-            isUmat_ = false;
 #endif
         ++storePos_;
 
diff --git a/modules/ts/include/opencv2/ts/ocl_test.hpp b/modules/ts/include/opencv2/ts/ocl_test.hpp
index 169e34fdc..0f0d9657f 100644
--- a/modules/ts/include/opencv2/ts/ocl_test.hpp
+++ b/modules/ts/include/opencv2/ts/ocl_test.hpp
@@ -62,53 +62,73 @@ extern int test_loop_times;
 #define MAX_VALUE 357
 
 #define EXPECT_MAT_NORM(mat, eps) \
+do \
 { \
-    EXPECT_LE(TestUtils::checkNorm(mat), eps) \
-}
+    EXPECT_LE(TestUtils::checkNorm1(mat), eps); /* statement must be terminated before the closing brace */ \
+} while ((void)0, 0)
 
 #define EXPECT_MAT_NEAR(mat1, mat2, eps) \
+do \
 { \
     ASSERT_EQ(mat1.type(), mat2.type()); \
     ASSERT_EQ(mat1.size(), mat2.size()); \
-    EXPECT_LE(TestUtils::checkNorm(mat1, mat2), eps) \
+    EXPECT_LE(TestUtils::checkNorm2(mat1, mat2), eps) \
         << "Size: " << mat1.size() << std::endl; \
-}
+} while ((void)0, 0)
 
 #define EXPECT_MAT_NEAR_RELATIVE(mat1, mat2, eps) \
+do \
 { \
     ASSERT_EQ(mat1.type(), mat2.type()); \
     ASSERT_EQ(mat1.size(), mat2.size()); \
     EXPECT_LE(TestUtils::checkNormRelative(mat1, mat2), eps) \
         << "Size: " << mat1.size() << std::endl; \
-}
+} while ((void)0, 0)
 
 #define OCL_EXPECT_MATS_NEAR(name, eps) \
+do \
 { \
-    EXPECT_MAT_NEAR(name ## _roi, u ## name ## _roi, eps); \
-    int nextValue = rng.next(); \
-    RNG dataRng1(nextValue), dataRng2(nextValue); \
-    dataRng1.fill(name ## _roi, RNG::UNIFORM, Scalar::all(-MAX_VALUE), Scalar::all(MAX_VALUE)); \
-    dataRng2.fill(u ## name ## _roi, RNG::UNIFORM, Scalar::all(-MAX_VALUE), Scalar::all(MAX_VALUE)); \
-    EXPECT_MAT_NEAR(name, u ## name, 0/*FLT_EPSILON*/); \
-}
+    ASSERT_EQ(name ## _roi.type(), u ## name ## _roi.type()); \
+    ASSERT_EQ(name ## _roi.size(), u ## name ## _roi.size()); \
+    EXPECT_LE(TestUtils::checkNorm2(name ## _roi, u ## name ## _roi), eps) \
+        << "Size: " << name ## _roi.size() << std::endl; \
+    Point _offset; \
+    Size _wholeSize; \
+    u ## name ## _roi.locateROI(_wholeSize, _offset); \
+    Mat _mask(name.size(), CV_8UC1, Scalar::all(255)); \
+    _mask(Rect(_offset, name ## _roi.size())).setTo(Scalar::all(0)); \
+    ASSERT_EQ(name.type(), u ## name.type()); \
+    ASSERT_EQ(name.size(), u ## name.size()); \
+    EXPECT_LE(TestUtils::checkNorm2(name, u ## name, _mask), eps) \
+        << "Size: " << name ## _roi.size() << std::endl; \
+} while ((void)0, 0)
 
 #define OCL_EXPECT_MATS_NEAR_RELATIVE(name, eps) \
+do \
 { \
-    EXPECT_MAT_NEAR_RELATIVE(name ## _roi, u ## name ## _roi, eps); \
-    int nextValue = rng.next(); \
-    RNG dataRng1(nextValue), dataRng2(nextValue); \
-    dataRng1.fill(name ## _roi, RNG::UNIFORM, Scalar::all(-MAX_VALUE), Scalar::all(MAX_VALUE)); \
-    dataRng2.fill(u ## name ## _roi, RNG::UNIFORM, Scalar::all(-MAX_VALUE), Scalar::all(MAX_VALUE)); \
-    EXPECT_MAT_NEAR_RELATIVE(name, u ## name, 0/*FLT_EPSILON*/); \
-}
+    ASSERT_EQ(name ## _roi.type(), u ## name ## _roi.type()); \
+    ASSERT_EQ(name ## _roi.size(), u ## name ## _roi.size()); \
+    EXPECT_LE(TestUtils::checkNormRelative(name ## _roi, u ## name ## _roi), eps) \
+        << "Size: " << name ## _roi.size() << std::endl; \
+    Point _offset; \
+    Size _wholeSize; \
+    name ## _roi.locateROI(_wholeSize, _offset); \
+    Mat _mask(name.size(), CV_8UC1, Scalar::all(255)); \
+    _mask(Rect(_offset, name ## _roi.size())).setTo(Scalar::all(0)); \
+    ASSERT_EQ(name.type(), u ## name.type()); \
+    ASSERT_EQ(name.size(), u ## name.size()); \
+    EXPECT_LE(TestUtils::checkNormRelative(name, u ## name, _mask), eps) \
+        << "Size: " << name ## _roi.size() << std::endl; \
+} while ((void)0, 0)
 
 #define EXPECT_MAT_SIMILAR(mat1, mat2, eps) \
+do \
 { \
     ASSERT_EQ(mat1.type(), mat2.type()); \
     ASSERT_EQ(mat1.size(), mat2.size()); \
     EXPECT_LE(checkSimilarity(mat1, mat2), eps) \
         << "Size: " << mat1.size() << std::endl; \
-}
+} while ((void)0, 0)
 
 using perf::MatDepth;
 using perf::MatType;
@@ -205,28 +225,30 @@ struct CV_EXPORTS TestUtils
     static cv::Mat readImage(const String &fileName, int flags = cv::IMREAD_COLOR);
     static cv::Mat readImageType(const String &fname, int type);
 
-    static double checkNorm(InputArray m);
-    static double checkNorm(InputArray m1, InputArray m2);
+    static double checkNorm1(InputArray m, InputArray mask = noArray());
+    static double checkNorm2(InputArray m1, InputArray m2, InputArray mask = noArray());
     static double checkSimilarity(InputArray m1, InputArray m2);
     static void showDiff(InputArray _src, InputArray _gold, InputArray _actual, double eps, bool alwaysShow);
 
-    static inline double checkNormRelative(InputArray m1, InputArray m2)
+    static inline double checkNormRelative(InputArray m1, InputArray m2, InputArray mask = noArray())
     {
-        return cv::norm(m1.getMat(), m2.getMat(), cv::NORM_INF) /
+        return cv::norm(m1.getMat(), m2.getMat(), cv::NORM_INF, mask) /
                 std::max((double)std::numeric_limits<float>::epsilon(),
                          (double)std::max(cv::norm(m1.getMat(), cv::NORM_INF), norm(m2.getMat(), cv::NORM_INF)));
     }
 };
 
-#define TEST_DECLARE_INPUT_PARAMETER(name) Mat name, name ## _roi; UMat u ## name, u ## name ## _roi;
+#define TEST_DECLARE_INPUT_PARAMETER(name) Mat name, name ## _roi; UMat u ## name, u ## name ## _roi
 #define TEST_DECLARE_OUTPUT_PARAMETER(name) TEST_DECLARE_INPUT_PARAMETER(name)
 
 #define UMAT_UPLOAD_INPUT_PARAMETER(name) \
+do \
 { \
     name.copyTo(u ## name); \
     Size _wholeSize; Point ofs; name ## _roi.locateROI(_wholeSize, ofs); \
     u ## name ## _roi = u ## name(Rect(ofs.x, ofs.y, name ## _roi.size().width, name ## _roi.size().height)); \
-}
+} while ((void)0, 0)
+
 #define UMAT_UPLOAD_OUTPUT_PARAMETER(name) UMAT_UPLOAD_INPUT_PARAMETER(name)
 
 template <typename T>
diff --git a/modules/ts/include/opencv2/ts/ts_perf.hpp b/modules/ts/include/opencv2/ts/ts_perf.hpp
index e3b6481d1..62e9e1471 100644
--- a/modules/ts/include/opencv2/ts/ts_perf.hpp
+++ b/modules/ts/include/opencv2/ts/ts_perf.hpp
@@ -4,6 +4,8 @@
 #include "opencv2/core.hpp"
 #include "ts_gtest.h"
 
+#include <functional>
+
 #if !(defined(LOGD) || defined(LOGI) || defined(LOGW) || defined(LOGE))
 # if defined(ANDROID) && defined(USE_ANDROID_LOGGING)
 #  include <android/log.h>
@@ -555,31 +557,33 @@ namespace comparators
 {
 
 template<typename T>
-struct CV_EXPORTS RectLess_
+struct CV_EXPORTS RectLess_ :
+        public std::binary_function<cv::Rect_<T>, cv::Rect_<T>, bool>
 {
   bool operator()(const cv::Rect_<T>& r1, const cv::Rect_<T>& r2) const
   {
-    return r1.x < r2.x
-      || (r1.x == r2.x && r1.y < r2.y)
-      || (r1.x == r2.x && r1.y == r2.y && r1.width < r2.width)
-      || (r1.x == r2.x && r1.y == r2.y && r1.width == r2.width && r1.height < r2.height);
+    return r1.x < r2.x ||
+            (r1.x == r2.x && r1.y < r2.y) ||
+            (r1.x == r2.x && r1.y == r2.y && r1.width < r2.width) ||
+            (r1.x == r2.x && r1.y == r2.y && r1.width == r2.width && r1.height < r2.height);
   }
 };
 
 typedef RectLess_<int> RectLess;
 
-struct CV_EXPORTS KeypointGreater
+struct CV_EXPORTS KeypointGreater :
+        public std::binary_function<cv::KeyPoint, cv::KeyPoint, bool>
 {
     bool operator()(const cv::KeyPoint& kp1, const cv::KeyPoint& kp2) const
     {
-        if(kp1.response > kp2.response) return true;
-        if(kp1.response < kp2.response) return false;
-        if(kp1.size > kp2.size) return true;
-        if(kp1.size < kp2.size) return false;
-        if(kp1.octave > kp2.octave) return true;
-        if(kp1.octave < kp2.octave) return false;
-        if(kp1.pt.y < kp2.pt.y) return false;
-        if(kp1.pt.y > kp2.pt.y) return true;
+        if (kp1.response > kp2.response) return true;
+        if (kp1.response < kp2.response) return false;
+        if (kp1.size > kp2.size) return true;
+        if (kp1.size < kp2.size) return false;
+        if (kp1.octave > kp2.octave) return true;
+        if (kp1.octave < kp2.octave) return false;
+        if (kp1.pt.y < kp2.pt.y) return false;
+        if (kp1.pt.y > kp2.pt.y) return true;
         return kp1.pt.x < kp2.pt.x;
     }
 };
diff --git a/modules/ts/misc/table_formatter.py b/modules/ts/misc/table_formatter.py
index 9baff0f79..2e1467b80 100755
--- a/modules/ts/misc/table_formatter.py
+++ b/modules/ts/misc/table_formatter.py
@@ -426,7 +426,7 @@ class table(object):
                     if r == 0:
                         css = css[:-1] + "border-top:2px solid #6678B1;\""
                 out.write("   <td%s%s>\n" % (attr, css))
-                if th is not None:
+                if td is not None:
                     out.write("    %s\n" % htmlEncode(td.text))
                 out.write("   </td>\n")
                 i += colspan
diff --git a/modules/ts/src/ocl_test.cpp b/modules/ts/src/ocl_test.cpp
index caf5bf410..0291cadbe 100644
--- a/modules/ts/src/ocl_test.cpp
+++ b/modules/ts/src/ocl_test.cpp
@@ -223,14 +223,14 @@ Mat TestUtils::readImageType(const String &fname, int type)
     return src;
 }
 
-double TestUtils::checkNorm(InputArray m)
+double TestUtils::checkNorm1(InputArray m, InputArray mask)
 {
-    return norm(m.getMat(), NORM_INF);
+    return norm(m.getMat(), NORM_INF, mask);
 }
 
-double TestUtils::checkNorm(InputArray m1, InputArray m2)
+double TestUtils::checkNorm2(InputArray m1, InputArray m2, InputArray mask)
 {
-    return norm(m1.getMat(), m2.getMat(), NORM_INF);
+    return norm(m1.getMat(), m2.getMat(), NORM_INF, mask);
 }
 
 double TestUtils::checkSimilarity(InputArray m1, InputArray m2)
diff --git a/modules/ts/src/ts_perf.cpp b/modules/ts/src/ts_perf.cpp
index f1403b9c5..25981a721 100644
--- a/modules/ts/src/ts_perf.cpp
+++ b/modules/ts/src/ts_perf.cpp
@@ -20,7 +20,7 @@ static std::vector<std::string> available_impls;
 static std::string  param_impl;
 
 static enum PERF_STRATEGY strategyForce = PERF_STRATEGY_DEFAULT;
-static enum PERF_STRATEGY strategyModule = PERF_STRATEGY_BASE;
+static enum PERF_STRATEGY strategyModule = PERF_STRATEGY_SIMPLE;
 
 static double       param_max_outliers;
 static double       param_max_deviation;
diff --git a/modules/video/src/bgfg_gaussmix2.cpp b/modules/video/src/bgfg_gaussmix2.cpp
index 1e6ee0d88..098310f6b 100644
--- a/modules/video/src/bgfg_gaussmix2.cpp
+++ b/modules/video/src/bgfg_gaussmix2.cpp
@@ -578,7 +578,7 @@ public:
                 for( int mode = 0; mode < nmodes; mode++, mean_m += nchannels )
                 {
                     float weight = alpha1*gmm[mode].weight + prune;//need only weight if fit is found
-
+                    int swap_count = 0;
                     ////
                     //fit not found yet
                     if( !fitsPDF )
@@ -643,6 +643,7 @@ public:
                                 if( weight < gmm[i-1].weight )
                                     break;
 
+                                swap_count++;
                                 //swap one up
                                 std::swap(gmm[i], gmm[i-1]);
                                 for( int c = 0; c < nchannels; c++ )
@@ -660,7 +661,7 @@ public:
                         nmodes--;
                     }
 
-                    gmm[mode].weight = weight;//update weight by the calculated value
+                    gmm[mode-swap_count].weight = weight;//update weight by the calculated value
                     totalWeight += weight;
                 }
                 //go through all modes
@@ -918,4 +919,4 @@ Ptr<BackgroundSubtractorMOG2> createBackgroundSubtractorMOG2(int _history, doubl
 
 }
 
-/* End of file. */
\ No newline at end of file
+/* End of file. */
diff --git a/modules/video/src/lkpyramid.cpp b/modules/video/src/lkpyramid.cpp
index c95835d9c..5b653c9d1 100644
--- a/modules/video/src/lkpyramid.cpp
+++ b/modules/video/src/lkpyramid.cpp
@@ -975,9 +975,7 @@ namespace cv
             idxArg = kernel.set(idxArg, imageI); //image2d_t I
             idxArg = kernel.set(idxArg, imageJ); //image2d_t J
             idxArg = kernel.set(idxArg, ocl::KernelArg::PtrReadOnly(prevPts)); // __global const float2* prevPts
-            idxArg = kernel.set(idxArg, (int)prevPts.step); // int prevPtsStep
             idxArg = kernel.set(idxArg, ocl::KernelArg::PtrReadWrite(nextPts)); // __global const float2* nextPts
-            idxArg = kernel.set(idxArg, (int)nextPts.step); //  int nextPtsStep
             idxArg = kernel.set(idxArg, ocl::KernelArg::PtrReadWrite(status)); // __global uchar* status
             idxArg = kernel.set(idxArg, ocl::KernelArg::PtrReadWrite(err)); // __global float* err
             idxArg = kernel.set(idxArg, (int)level); // const int level
diff --git a/modules/video/src/opencl/bgfg_mog2.cl b/modules/video/src/opencl/bgfg_mog2.cl
index f895b5be7..9bc18b215 100644
--- a/modules/video/src/opencl/bgfg_mog2.cl
+++ b/modules/video/src/opencl/bgfg_mog2.cl
@@ -102,7 +102,7 @@ __kernel void mog2_kernel(__global const uchar* frame, int frame_step, int frame
         {
 
             float c_weight = alpha1 * _weight[(mode * frame_row + y) * weight_step + x] + prune;
-
+            int swap_count = 0;
             if (!fitsPDF)
             {
                 float c_var = _variance[(mode * frame_row + y) * var_step + x];
@@ -132,6 +132,7 @@ __kernel void mog2_kernel(__global const uchar* frame, int frame_step, int frame
                     {
                         if (c_weight < _weight[((i - 1) * frame_row + y) * weight_step + x])
                             break;
+                        swap_count++;
                         swap(_weight, x, y, i - 1, frame_row, weight_step);
                         swap(_variance, x, y, i - 1, frame_row, var_step);
                         #if (CN==1)
@@ -149,7 +150,7 @@ __kernel void mog2_kernel(__global const uchar* frame, int frame_step, int frame
                 nmodes--;
             }
 
-            _weight[(mode * frame_row + y) * weight_step + x] = c_weight; //update weight by the calculated value
+            _weight[((mode - swap_count) * frame_row + y) * weight_step + x] = c_weight; //update weight by the calculated value
             totalWeight += c_weight;
         }
 
diff --git a/modules/video/src/opencl/pyrlk.cl b/modules/video/src/opencl/pyrlk.cl
index 45571c7b6..1e27c8af5 100644
--- a/modules/video/src/opencl/pyrlk.cl
+++ b/modules/video/src/opencl/pyrlk.cl
@@ -264,47 +264,6 @@ inline void GetError(image2d_t J, const float x, const float y, const float* Pch
     *errval += fabs(diff);
 }
 
-inline void SetPatch4(image2d_t I, const float x, const float y,
-               float4* Pch, float4* Dx, float4* Dy,
-               float* A11, float* A12, float* A22)
-{
-    *Pch = read_imagef(I, sampler, (float2)(x, y));
-
-    float4 dIdx = 3.0f * read_imagef(I, sampler, (float2)(x + 1, y - 1)) + 10.0f * read_imagef(I, sampler, (float2)(x + 1, y)) + 3.0f * read_imagef(I, sampler, (float2)(x + 1, y + 1)) -
-                  (3.0f * read_imagef(I, sampler, (float2)(x - 1, y - 1)) + 10.0f * read_imagef(I, sampler, (float2)(x - 1, y)) + 3.0f * read_imagef(I, sampler, (float2)(x - 1, y + 1)));
-
-    float4 dIdy = 3.0f * read_imagef(I, sampler, (float2)(x - 1, y + 1)) + 10.0f * read_imagef(I, sampler, (float2)(x, y + 1)) + 3.0f * read_imagef(I, sampler, (float2)(x + 1, y + 1)) -
-                  (3.0f * read_imagef(I, sampler, (float2)(x - 1, y - 1)) + 10.0f * read_imagef(I, sampler, (float2)(x, y - 1)) + 3.0f * read_imagef(I, sampler, (float2)(x + 1, y - 1)));
-
-
-    *Dx = dIdx;
-    *Dy = dIdy;
-    float4 sqIdx = dIdx * dIdx;
-    *A11 += sqIdx.x + sqIdx.y + sqIdx.z;
-    sqIdx = dIdx * dIdy;
-    *A12 += sqIdx.x + sqIdx.y + sqIdx.z;
-    sqIdx = dIdy * dIdy;
-    *A22 += sqIdx.x + sqIdx.y + sqIdx.z;
-}
-
-inline void GetPatch4(image2d_t J, const float x, const float y,
-               const float4* Pch, const float4* Dx, const float4* Dy,
-               float* b1, float* b2)
-{
-    float4 J_val = read_imagef(J, sampler, (float2)(x, y));
-    float4 diff = (J_val - *Pch) * 32.0f;
-    float4 xdiff = diff* *Dx;
-    *b1 += xdiff.x + xdiff.y + xdiff.z;
-    xdiff = diff* *Dy;
-    *b2 += xdiff.x + xdiff.y + xdiff.z;
-}
-
-inline void GetError4(image2d_t J, const float x, const float y, const float4* Pch, float* errval)
-{
-    float4 diff = read_imagef(J, sampler, (float2)(x,y))-*Pch;
-    *errval += fabs(diff.x) + fabs(diff.y) + fabs(diff.z);
-}
-
 
 //macro to read pixel value into local memory.
 #define READI(_y,_x)    IPatchLocal[yid+((_y)*LSy)][xid+((_x)*LSx)] = read_imagef(I, sampler, (float2)(Point.x + xid+(_x)*LSx + 0.5f-1, Point.y + yid+(_y)*LSy+ 0.5f-1)).x;
@@ -460,7 +419,6 @@ __kernel void lkSparse(image2d_t I, image2d_t J,
                 status[gid] = 0;
             break;
         }
-
         float b1 = 0;
         float b2 = 0;
 
diff --git a/modules/video/test/ocl/test_bgfg_mog2.cpp b/modules/video/test/ocl/test_bgfg_mog2.cpp
index bfb1621fe..0a52227ec 100644
--- a/modules/video/test/ocl/test_bgfg_mog2.cpp
+++ b/modules/video/test/ocl/test_bgfg_mog2.cpp
@@ -70,7 +70,7 @@ OCL_TEST_P(Mog2_Update, Accuracy)
         OCL_ON (mog2_ocl->apply(frame, u_foreground));
 
         if (detectShadow)
-            EXPECT_MAT_SIMILAR(foreground, u_foreground, 15e-3)
+            EXPECT_MAT_SIMILAR(foreground, u_foreground, 15e-3);
         else
             EXPECT_MAT_NEAR(foreground, u_foreground, 0);
     }
@@ -133,4 +133,4 @@ OCL_INSTANTIATE_TEST_CASE_P(OCL_Video, Mog2_getBackgroundImage, (Values(DetectSh
 }}// namespace cvtest::ocl
 
     #endif
-#endif
\ No newline at end of file
+#endif
diff --git a/modules/video/test/ocl/test_motempl.cpp b/modules/video/test/ocl/test_motempl.cpp
index 7b4c22755..91053d9ad 100644
--- a/modules/video/test/ocl/test_motempl.cpp
+++ b/modules/video/test/ocl/test_motempl.cpp
@@ -18,8 +18,8 @@ PARAM_TEST_CASE(UpdateMotionHistory, bool)
     double timestamp, duration;
     bool use_roi;
 
-    TEST_DECLARE_INPUT_PARAMETER(silhouette)
-    TEST_DECLARE_OUTPUT_PARAMETER(mhi)
+    TEST_DECLARE_INPUT_PARAMETER(silhouette);
+    TEST_DECLARE_OUTPUT_PARAMETER(mhi);
 
     virtual void SetUp()
     {
@@ -40,8 +40,8 @@ PARAM_TEST_CASE(UpdateMotionHistory, bool)
         if (timestamp < duration)
             std::swap(timestamp, duration);
 
-        UMAT_UPLOAD_INPUT_PARAMETER(silhouette)
-        UMAT_UPLOAD_OUTPUT_PARAMETER(mhi)
+        UMAT_UPLOAD_INPUT_PARAMETER(silhouette);
+        UMAT_UPLOAD_OUTPUT_PARAMETER(mhi);
     }
 };
 
@@ -54,7 +54,7 @@ OCL_TEST_P(UpdateMotionHistory, Mat)
         OCL_OFF(cv::updateMotionHistory(silhouette_roi, mhi_roi, timestamp, duration));
         OCL_ON(cv::updateMotionHistory(usilhouette_roi, umhi_roi, timestamp, duration));
 
-        OCL_EXPECT_MATS_NEAR(mhi, 0)
+        OCL_EXPECT_MATS_NEAR(mhi, 0);
     }
 }
 
diff --git a/modules/video/test/ocl/test_optflow_farneback.cpp b/modules/video/test/ocl/test_optflow_farneback.cpp
index c2d13e006..cc40f749b 100644
--- a/modules/video/test/ocl/test_optflow_farneback.cpp
+++ b/modules/video/test/ocl/test_optflow_farneback.cpp
@@ -101,7 +101,7 @@ OCL_TEST_P(FarnebackOpticalFlow, Mat)
     OCL_OFF(cv::calcOpticalFlowFarneback(frame0, frame1, flow, pyrScale, numLevels, winSize, numIters, polyN, polySigma, flags));
     OCL_ON(cv::calcOpticalFlowFarneback(frame0, frame1, uflow, pyrScale, numLevels, winSize, numIters, polyN, polySigma, flags));
 
-    EXPECT_MAT_SIMILAR(flow, uflow, 0.1)
+    EXPECT_MAT_SIMILAR(flow, uflow, 0.1);
 }
 
 
@@ -117,4 +117,4 @@ OCL_INSTANTIATE_TEST_CASE_P(Video, FarnebackOpticalFlow,
 
 } } // namespace cvtest::ocl
 
-#endif // HAVE_OPENCL
\ No newline at end of file
+#endif // HAVE_OPENCL
diff --git a/modules/video/test/test_tvl1optflow.cpp b/modules/video/test/test_tvl1optflow.cpp
index 804eae8b6..274c13e65 100644
--- a/modules/video/test/test_tvl1optflow.cpp
+++ b/modules/video/test/test_tvl1optflow.cpp
@@ -133,14 +133,13 @@ namespace
                 }
             }
         }
-
         return sqrt(sum / (1e-9 + counter));
     }
 }
 
 TEST(Video_calcOpticalFlowDual_TVL1, Regression)
 {
-    const double MAX_RMSE = 0.02;
+    const double MAX_RMSE = 0.03;
 
     const string frame1_path = TS::ptr()->get_data_path() + "optflow/RubberWhale1.png";
     const string frame2_path = TS::ptr()->get_data_path() + "optflow/RubberWhale2.png";
diff --git a/modules/viz/doc/widget.rst b/modules/viz/doc/widget.rst
index a8d9ac6fd..906adf9ba 100644
--- a/modules/viz/doc/widget.rst
+++ b/modules/viz/doc/widget.rst
@@ -1050,7 +1050,7 @@ viz::WWidgetMerger::WWidgetMerger
 ---------------------------------------
 Constructs a WWidgetMerger.
 
-.. ocv:WWidgetMerger:: WWidgetMerger()
+.. ocv:function:: WWidgetMerger()
 
 viz::WWidgetMerger::addCloud
 -------------------------------
diff --git a/samples/cpp/tutorial_code/Histograms_Matching/compareHist_Demo.cpp b/samples/cpp/tutorial_code/Histograms_Matching/compareHist_Demo.cpp
index f4dd4e5e4..424a38e93 100644
--- a/samples/cpp/tutorial_code/Histograms_Matching/compareHist_Demo.cpp
+++ b/samples/cpp/tutorial_code/Histograms_Matching/compareHist_Demo.cpp
@@ -40,13 +40,13 @@ int main( int argc, char** argv )
 
     hsv_half_down = hsv_base( Range( hsv_base.rows/2, hsv_base.rows - 1 ), Range( 0, hsv_base.cols - 1 ) );
 
-    /// Using 30 bins for hue and 32 for saturation
+    /// Using 50 bins for hue and 60 for saturation
     int h_bins = 50; int s_bins = 60;
     int histSize[] = { h_bins, s_bins };
 
-    // hue varies from 0 to 256, saturation from 0 to 180
-    float s_ranges[] = { 0, 256 };
+    // hue varies from 0 to 179, saturation from 0 to 255
     float h_ranges[] = { 0, 180 };
+    float s_ranges[] = { 0, 256 };
 
     const float* ranges[] = { h_ranges, s_ranges };
 
diff --git a/samples/gpu/CMakeLists.txt b/samples/gpu/CMakeLists.txt
index 46b465a87..ca5243aa0 100644
--- a/samples/gpu/CMakeLists.txt
+++ b/samples/gpu/CMakeLists.txt
@@ -49,8 +49,8 @@ if(BUILD_EXAMPLES AND OCV_DEPENDENCIES_FOUND)
 
     target_link_libraries(${the_target} ${OPENCV_LINKER_LIBS} ${OPENCV_CUDA_SAMPLES_REQUIRED_DEPS})
 
-    if(HAVE_CUDA)
-        target_link_libraries(${the_target} ${CUDA_CUDA_LIBRARY})
+    if(HAVE_CUDA AND NOT ANDROID)
+      target_link_libraries(${the_target} ${CUDA_CUDA_LIBRARY})
     endif()
 
     if(HAVE_opencv_nonfree)
diff --git a/samples/gpu/brox_optical_flow.cpp b/samples/gpu/brox_optical_flow.cpp
index 08973861f..638aade45 100644
--- a/samples/gpu/brox_optical_flow.cpp
+++ b/samples/gpu/brox_optical_flow.cpp
@@ -1,7 +1,7 @@
 #include <iostream>
 #include <iomanip>
 #include <string>
-#include <cctype>
+#include <ctype.h>
 
 #include "opencv2/core.hpp"
 #include "opencv2/core/utility.hpp"
diff --git a/samples/gpu/opticalflow_nvidia_api.cpp b/samples/gpu/opticalflow_nvidia_api.cpp
index 29aaa0645..63eebfdaf 100644
--- a/samples/gpu/opticalflow_nvidia_api.cpp
+++ b/samples/gpu/opticalflow_nvidia_api.cpp
@@ -7,6 +7,7 @@
 #include <memory>
 #include <exception>
 #include <ctime>
+#include <ctype.h>
 
 #include "cvconfig.h"
 #include <iostream>
diff --git a/samples/gpu/super_resolution.cpp b/samples/gpu/super_resolution.cpp
index 3066e8f74..4e3de21db 100644
--- a/samples/gpu/super_resolution.cpp
+++ b/samples/gpu/super_resolution.cpp
@@ -1,6 +1,8 @@
 #include <iostream>
 #include <iomanip>
 #include <string>
+#include <ctype.h>
+
 #include "opencv2/core.hpp"
 #include "opencv2/core/utility.hpp"
 #include "opencv2/highgui.hpp"
diff --git a/samples/python2/asift.py b/samples/python2/asift.py
index e522fa7ab..25320865c 100755
--- a/samples/python2/asift.py
+++ b/samples/python2/asift.py
@@ -16,7 +16,7 @@ USAGE
   --feature  - Feature to use. Can be sift, surf, orb or brisk. Append '-flann'
                to feature name to use Flann-based matcher instead bruteforce.
 
-  Press left mouse button on a feature point to see its mathcing point.
+  Press left mouse button on a feature point to see its matching point.
 '''
 
 import numpy as np