diff --git a/3rdparty/ippicv/.gitignore b/3rdparty/ippicv/.gitignore new file mode 100644 index 000000000..c7decd241 --- /dev/null +++ b/3rdparty/ippicv/.gitignore @@ -0,0 +1,4 @@ +downloads/ +macosx/ +linux/ +windows/ diff --git a/3rdparty/ippicv/downloader.cmake b/3rdparty/ippicv/downloader.cmake new file mode 100644 index 000000000..849c03881 --- /dev/null +++ b/3rdparty/ippicv/downloader.cmake @@ -0,0 +1,100 @@ +# +# The script downloads ICV package +# +# On return this will define: +# OPENCV_ICV_PATH - path to unpacked downloaded package +# + +function(_icv_downloader) + # Define actual ICV versions + if(APPLE) + set(OPENCV_ICV_PACKAGE_NAME "ippicv_macosx.tar.gz") + set(OPENCV_ICV_PACKAGE_HASH "d489e447906de7808a9a9d7e3f225f7a") + set(OPENCV_ICV_PLATFORM "macosx") + elseif(UNIX AND NOT ANDROID) + set(OPENCV_ICV_PACKAGE_NAME "ippicv_linux.tar.gz") + set(OPENCV_ICV_PACKAGE_HASH "42798c6cd6348bd40e74c425dc23338a") + set(OPENCV_ICV_PLATFORM "linux") + elseif(WIN32 AND NOT ARM) + set(OPENCV_ICV_PACKAGE_NAME "ippicv_windows.zip") + set(OPENCV_ICV_PACKAGE_HASH "2715f39ae65dc09bae3648bffe538706") + set(OPENCV_ICV_PLATFORM "windows") + else() + return() # Not supported + endif() + + set(OPENCV_ICV_PATH "${CMAKE_CURRENT_LIST_DIR}/${OPENCV_ICV_PLATFORM}") + + if(DEFINED OPENCV_ICV_PACKAGE_DOWNLOADED + AND OPENCV_ICV_PACKAGE_DOWNLOADED STREQUAL OPENCV_ICV_PACKAGE_HASH + AND EXISTS ${OPENCV_ICV_PATH}) + # Package has been downloaded and checked by the previous build + set(OPENCV_ICV_PATH "${OPENCV_ICV_PATH}" PARENT_SCOPE) + return() + else() + if(EXISTS ${OPENCV_ICV_PATH}) + message(STATUS "ICV: Removing previous unpacked package: ${OPENCV_ICV_PATH}") + file(REMOVE_RECURSE ${OPENCV_ICV_PATH}) + endif() + endif() + unset(OPENCV_ICV_PACKAGE_DOWNLOADED CACHE) + + set(OPENCV_ICV_PACKAGE_ARCHIVE "${CMAKE_CURRENT_LIST_DIR}/downloads/${OPENCV_ICV_PLATFORM}-${OPENCV_ICV_PACKAGE_HASH}/${OPENCV_ICV_PACKAGE_NAME}") + get_filename_component(OPENCV_ICV_PACKAGE_ARCHIVE_DIR 
"${OPENCV_ICV_PACKAGE_ARCHIVE}" PATH) + if(EXISTS "${OPENCV_ICV_PACKAGE_ARCHIVE}") + file(MD5 "${OPENCV_ICV_PACKAGE_ARCHIVE}" archive_md5) + if(NOT archive_md5 STREQUAL OPENCV_ICV_PACKAGE_HASH) + message(WARNING "ICV: Local copy of ICV package has invalid MD5 hash: ${archive_md5} (expected: ${OPENCV_ICV_PACKAGE_HASH})") + file(REMOVE "${OPENCV_ICV_PACKAGE_ARCHIVE}") + file(REMOVE_RECURSE "${OPENCV_ICV_PACKAGE_ARCHIVE_DIR}") + endif() + endif() + + if(NOT EXISTS "${OPENCV_ICV_PACKAGE_ARCHIVE}") + if(NOT DEFINED OPENCV_ICV_URL) + if(NOT DEFINED ENV{OPENCV_ICV_URL}) + # TODO Specify default URL after ICV publishing + message(STATUS "ICV: downloading URL is not specified, skip downloading") + return() + endif() + set(OPENCV_ICV_URL $ENV{OPENCV_ICV_URL}) + endif() + + file(MAKE_DIRECTORY ${OPENCV_ICV_PACKAGE_ARCHIVE_DIR}) + message(STATUS "ICV: Downloading ${OPENCV_ICV_PACKAGE_NAME}...") + file(DOWNLOAD "${OPENCV_ICV_URL}/${OPENCV_ICV_PACKAGE_NAME}" "${OPENCV_ICV_PACKAGE_ARCHIVE}" + TIMEOUT 600 STATUS __status + EXPECTED_MD5 ${OPENCV_ICV_PACKAGE_HASH}) + if(NOT __status EQUAL 0) + message(FATAL_ERROR "ICV: Failed to download ICV package: ${OPENCV_ICV_PACKAGE_NAME}. 
Status=${__status}") + else() + # Don't remove this code, because EXPECTED_MD5 parameter doesn't fail "file(DOWNLOAD)" step + # on wrong hash + file(MD5 "${OPENCV_ICV_PACKAGE_ARCHIVE}" archive_md5) + if(NOT archive_md5 STREQUAL OPENCV_ICV_PACKAGE_HASH) + message(FATAL_ERROR "ICV: Downloaded copy of ICV package has invalid MD5 hash: ${archive_md5} (expected: ${OPENCV_ICV_PACKAGE_HASH})") + endif() + endif() + endif() + + ocv_assert(EXISTS "${OPENCV_ICV_PACKAGE_ARCHIVE}") + ocv_assert(NOT EXISTS "${OPENCV_ICV_PATH}") + file(MAKE_DIRECTORY ${OPENCV_ICV_PATH}) + ocv_assert(EXISTS "${OPENCV_ICV_PATH}") + + message(STATUS "ICV: Unpacking ${OPENCV_ICV_PACKAGE_NAME} to ${OPENCV_ICV_PATH}...") + execute_process(COMMAND ${CMAKE_COMMAND} -E tar xz "${OPENCV_ICV_PACKAGE_ARCHIVE}" + WORKING_DIRECTORY "${OPENCV_ICV_PATH}" + RESULT_VARIABLE __result) + + if(NOT __result EQUAL 0) + message(FATAL_ERROR "ICV: Failed to unpack ICV package from ${OPENCV_ICV_PACKAGE_ARCHIVE} to ${OPENCV_ICV_PATH} with error ${__result}") + endif() + + set(OPENCV_ICV_PACKAGE_DOWNLOADED "${OPENCV_ICV_PACKAGE_HASH}" CACHE INTERNAL "ICV package hash") + + message(STATUS "ICV: Package successfully downloaded") + set(OPENCV_ICV_PATH "${OPENCV_ICV_PATH}" PARENT_SCOPE) +endfunction() + +_icv_downloader() diff --git a/cmake/OpenCVFindIPP.cmake b/cmake/OpenCVFindIPP.cmake index 9cb6ed018..6ad6c2861 100644 --- a/cmake/OpenCVFindIPP.cmake +++ b/cmake/OpenCVFindIPP.cmake @@ -140,7 +140,7 @@ macro(ipp_detect_version) else() _ipp_not_supported("IPP ${IPP_VERSION_STR} at ${IPP_ROOT_DIR} is not supported") endif() - if(X86_64) + if(IPP_X64) _ipp_set_library_dir(${IPP_LIBRARY_DIR}/intel64) else() _ipp_set_library_dir(${IPP_LIBRARY_DIR}/ia32) @@ -221,7 +221,12 @@ if(DEFINED ENV{OPENCV_IPP_PATH} AND NOT DEFINED IPPROOT) set(IPPROOT "$ENV{OPENCV_IPP_PATH}") endif() if(NOT DEFINED IPPROOT) - set(IPPROOT "${OpenCV_SOURCE_DIR}/3rdparty/ippicv") + include("${OpenCV_SOURCE_DIR}/3rdparty/ippicv/downloader.cmake") + if(DEFINED 
OPENCV_ICV_PATH) + set(IPPROOT "${OPENCV_ICV_PATH}") + else() + return() + endif() endif() # Try ICV diff --git a/modules/calib3d/test/test_cameracalibration.cpp b/modules/calib3d/test/test_cameracalibration.cpp index da9b931f5..5b4b72cca 100644 --- a/modules/calib3d/test/test_cameracalibration.cpp +++ b/modules/calib3d/test/test_cameracalibration.cpp @@ -1608,7 +1608,7 @@ void CV_StereoCalibrationTest::run( int ) Mat _M1, _M2, _D1, _D2; vector _R1, _R2, _T1, _T2; calibrateCamera( objpt, imgpt1, imgsize, _M1, _D1, _R1, _T1, 0 ); - calibrateCamera( objpt, imgpt2, imgsize, _M2, _D2, _R2, _T1, 0 ); + calibrateCamera( objpt, imgpt2, imgsize, _M2, _D2, _R2, _T2, 0 ); undistortPoints( _imgpt1, _imgpt1, _M1, _D1, Mat(), _M1 ); undistortPoints( _imgpt2, _imgpt2, _M2, _D2, Mat(), _M2 ); diff --git a/modules/contrib/doc/facerec/src/crop_face.py b/modules/contrib/doc/facerec/src/crop_face.py index 6d6669d95..11472a93c 100755 --- a/modules/contrib/doc/facerec/src/crop_face.py +++ b/modules/contrib/doc/facerec/src/crop_face.py @@ -81,9 +81,32 @@ def CropFace(image, eye_left=(0,0), eye_right=(0,0), offset_pct=(0.2,0.2), dest_ image = image.resize(dest_sz, Image.ANTIALIAS) return image +def readFileNames(): + try: + inFile = open('path_to_created_csv_file.csv') + except: + raise IOError('There is no file named path_to_created_csv_file.csv in current directory.') + return False + + picPath = [] + picIndex = [] + + for line in inFile.readlines(): + if line != '': + fields = line.rstrip().split(';') + picPath.append(fields[0]) + picIndex.append(int(fields[1])) + + return (picPath, picIndex) + + if __name__ == "__main__": - image = Image.open("arnie.jpg") - CropFace(image, eye_left=(252,364), eye_right=(420,366), offset_pct=(0.1,0.1), dest_sz=(200,200)).save("arnie_10_10_200_200.jpg") - CropFace(image, eye_left=(252,364), eye_right=(420,366), offset_pct=(0.2,0.2), dest_sz=(200,200)).save("arnie_20_20_200_200.jpg") - CropFace(image, eye_left=(252,364), eye_right=(420,366), 
offset_pct=(0.3,0.3), dest_sz=(200,200)).save("arnie_30_30_200_200.jpg") - CropFace(image, eye_left=(252,364), eye_right=(420,366), offset_pct=(0.2,0.2)).save("arnie_20_20_70_70.jpg") + [images, indexes]=readFileNames() +if not os.path.exists("modified"): + os.makedirs("modified") +for img in images: + image = Image.open(img) + CropFace(image, eye_left=(252,364), eye_right=(420,366), offset_pct=(0.1,0.1), dest_sz=(200,200)).save("modified/"+img.rstrip().split('/')[1]+"_10_10_200_200.jpg") + CropFace(image, eye_left=(252,364), eye_right=(420,366), offset_pct=(0.2,0.2), dest_sz=(200,200)).save("modified/"+img.rstrip().split('/')[1]+"_20_20_200_200.jpg") + CropFace(image, eye_left=(252,364), eye_right=(420,366), offset_pct=(0.3,0.3), dest_sz=(200,200)).save("modified/"+img.rstrip().split('/')[1]+"_30_30_200_200.jpg") + CropFace(image, eye_left=(252,364), eye_right=(420,366), offset_pct=(0.2,0.2)).save("modified/"+img.rstrip().split('/')[1]+"_20_20_70_70.jpg") diff --git a/modules/core/include/opencv2/core/base.hpp b/modules/core/include/opencv2/core/base.hpp index 31cae3916..e9187faba 100644 --- a/modules/core/include/opencv2/core/base.hpp +++ b/modules/core/include/opencv2/core/base.hpp @@ -210,7 +210,30 @@ enum { # endif #endif +//! Suppress warning "-Wdeprecated-declarations" / C4996 +#if defined(_MSC_VER) + #define CV_DO_PRAGMA(x) __pragma(x) +#elif defined(__GNUC__) + #define CV_DO_PRAGMA(x) _Pragma (#x) +#else + #define CV_DO_PRAGMA(x) +#endif + +#ifdef _MSC_VER +#define CV_SUPPRESS_DEPRECATED_START \ + CV_DO_PRAGMA(warning(push)) \ + CV_DO_PRAGMA(warning(disable: 4996)) +#define CV_SUPPRESS_DEPRECATED_END CV_DO_PRAGMA(warning(pop)) +#elif defined __GNUC__ +#define CV_SUPPRESS_DEPRECATED_START \ + CV_DO_PRAGMA(GCC diagnostic push) \ + CV_DO_PRAGMA(GCC diagnostic ignored "-Wdeprecated-declarations") +#define CV_SUPPRESS_DEPRECATED_END CV_DO_PRAGMA(GCC diagnostic pop) +#else +#define CV_SUPPRESS_DEPRECATED_START +#define CV_SUPPRESS_DEPRECATED_END +#endif //! 
Signals an error and raises the exception. /*! diff --git a/modules/core/include/opencv2/core/mat.hpp b/modules/core/include/opencv2/core/mat.hpp index 6b8368fd5..d921f7565 100644 --- a/modules/core/include/opencv2/core/mat.hpp +++ b/modules/core/include/opencv2/core/mat.hpp @@ -218,6 +218,9 @@ public: virtual void release() const; virtual void clear() const; virtual void setTo(const _InputArray& value, const _InputArray & mask = _InputArray()) const; + + void assign(const UMat& u) const; + void assign(const Mat& m) const; }; diff --git a/modules/core/include/opencv2/core/ocl.hpp b/modules/core/include/opencv2/core/ocl.hpp index fdb6f9a0a..b751c5120 100644 --- a/modules/core/include/opencv2/core/ocl.hpp +++ b/modules/core/include/opencv2/core/ocl.hpp @@ -151,6 +151,10 @@ public: bool imageSupport() const; + bool imageFromBufferSupport() const; + uint imagePitchAlignment() const; + uint imageBaseAddressAlignment() const; + size_t image2DMaxWidth() const; size_t image2DMaxHeight() const; @@ -598,16 +602,31 @@ CV_EXPORTS int predictOptimalVectorWidth(InputArray src1, InputArray src2 = noAr InputArray src4 = noArray(), InputArray src5 = noArray(), InputArray src6 = noArray(), InputArray src7 = noArray(), InputArray src8 = noArray(), InputArray src9 = noArray()); +CV_EXPORTS void buildOptionsAddMatrixDescription(String& buildOptions, const String& name, InputArray _m); + class CV_EXPORTS Image2D { public: Image2D(); - explicit Image2D(const UMat &src); + + // src: The UMat from which to get image properties and data + // norm: Flag to enable the use of normalized channel data types + // alias: Flag indicating that the image should alias the src UMat. + // If true, changes to the image or src will be reflected in + // both objects. + explicit Image2D(const UMat &src, bool norm = false, bool alias = false); Image2D(const Image2D & i); ~Image2D(); Image2D & operator = (const Image2D & i); + // Indicates if creating an aliased image should succeed. 
Depends on the + // underlying platform and the dimensions of the UMat. + static bool canCreateAlias(const UMat &u); + + // Indicates if the image format is supported. + static bool isFormatSupported(int depth, int cn, bool norm); + void* ptr() const; protected: struct Impl; diff --git a/modules/core/include/opencv2/core/private.hpp b/modules/core/include/opencv2/core/private.hpp index 593ee9fd5..8857fa378 100644 --- a/modules/core/include/opencv2/core/private.hpp +++ b/modules/core/include/opencv2/core/private.hpp @@ -230,6 +230,15 @@ static inline IppiSize ippiSize(const cv::Size & _size) return size; } +static inline IppiBorderType ippiGetBorderType(int borderTypeNI) +{ + return borderTypeNI == cv::BORDER_CONSTANT ? ippBorderConst : + borderTypeNI == cv::BORDER_WRAP ? ippBorderWrap : + borderTypeNI == cv::BORDER_REPLICATE ? ippBorderRepl : + borderTypeNI == cv::BORDER_REFLECT_101 ? ippBorderMirror : + borderTypeNI == cv::BORDER_REFLECT ? ippBorderMirrorR : (IppiBorderType)-1; +} + #else # define IPP_VERSION_X100 0 #endif diff --git a/modules/core/src/matrix.cpp b/modules/core/src/matrix.cpp index 4efba4654..f7aded731 100644 --- a/modules/core/src/matrix.cpp +++ b/modules/core/src/matrix.cpp @@ -1369,6 +1369,21 @@ void _InputArray::getUMatVector(std::vector& umv) const return; } + if( k == UMAT ) + { + UMat& v = *(UMat*)obj; + umv.resize(1); + umv[0] = v; + return; + } + if( k == MAT ) + { + Mat& v = *(Mat*)obj; + umv.resize(1); + umv[0] = v.getUMat(accessFlags); + return; + } + CV_Error(Error::StsNotImplemented, "Unknown/unsupported array type"); } @@ -2592,6 +2607,43 @@ void _OutputArray::setTo(const _InputArray& arr, const _InputArray & mask) const CV_Error(Error::StsNotImplemented, ""); } + +void _OutputArray::assign(const UMat& u) const +{ + int k = kind(); + if (k == UMAT) + { + *(UMat*)obj = u; + } + else if (k == MAT) + { + u.copyTo(*(Mat*)obj); // TODO check u.getMat() + } + else + { + CV_Error(Error::StsNotImplemented, ""); + } +} + + +void 
_OutputArray::assign(const Mat& m) const +{ + int k = kind(); + if (k == UMAT) + { + m.copyTo(*(UMat*)obj); // TODO check m.getUMat() + } + else if (k == MAT) + { + *(Mat*)obj = m; + } + else + { + CV_Error(Error::StsNotImplemented, ""); + } +} + + static _InputOutputArray _none; InputOutputArray noArray() { return _none; } diff --git a/modules/core/src/ocl.cpp b/modules/core/src/ocl.cpp index 24190c52c..950fa4199 100644 --- a/modules/core/src/ocl.cpp +++ b/modules/core/src/ocl.cpp @@ -882,7 +882,6 @@ OCL_FUNC_P(cl_mem, clCreateImage2D, cl_int *errcode_ret), (context, flags, image_format, image_width, image_height, image_row_pitch, host_ptr, errcode_ret)) -/* OCL_FUNC(cl_int, clGetSupportedImageFormats, (cl_context context, cl_mem_flags flags, @@ -892,6 +891,7 @@ OCL_FUNC(cl_int, clGetSupportedImageFormats, cl_uint * num_image_formats), (context, flags, image_type, num_entries, image_formats, num_image_formats)) +/* OCL_FUNC(cl_int, clGetMemObjectInfo, (cl_mem memobj, cl_mem_info param_name, @@ -1912,6 +1912,38 @@ bool Device::hostUnifiedMemory() const bool Device::imageSupport() const { return p ? p->getBoolProp(CL_DEVICE_IMAGE_SUPPORT) : false; } +bool Device::imageFromBufferSupport() const +{ + bool ret = false; + if (p) + { + size_t pos = p->getStrProp(CL_DEVICE_EXTENSIONS).find("cl_khr_image2d_from_buffer"); + if (pos != String::npos) + { + ret = true; + } + } + return ret; +} + +uint Device::imagePitchAlignment() const +{ +#ifdef CL_DEVICE_IMAGE_PITCH_ALIGNMENT + return p ? p->getProp(CL_DEVICE_IMAGE_PITCH_ALIGNMENT) : 0; +#else + return 0; +#endif +} + +uint Device::imageBaseAddressAlignment() const +{ +#ifdef CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT + return p ? p->getProp(CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT) : 0; +#else + return 0; +#endif +} + size_t Device::image2DMaxWidth() const { return p ? 
p->getProp(CL_DEVICE_IMAGE2D_MAX_WIDTH) : 0; } @@ -2705,9 +2737,15 @@ struct Kernel::Impl haveTempDstUMats = true; } + void addImage(const Image2D& image) + { + images.push_back(image); + } + void finit() { cleanupUMats(); + images.clear(); if(e) { clReleaseEvent(e); e = 0; } release(); } @@ -2725,6 +2763,7 @@ struct Kernel::Impl enum { MAX_ARRS = 16 }; UMatData* u[MAX_ARRS]; int nu; + std::list images; bool haveTempDstUMats; }; @@ -2838,6 +2877,7 @@ int Kernel::set(int i, const void* value, size_t sz) int Kernel::set(int i, const Image2D& image2D) { + p->addImage(image2D); cl_mem h = (cl_mem)image2D.ptr(); return set(i, &h, sizeof(h)); } @@ -3798,11 +3838,16 @@ public: cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr(); - if( u->refcount == 0 ) + // FIXIT Workaround for UMat synchronization issue + // if( u->refcount == 0 ) { if( !u->copyOnMap() ) { - CV_Assert(u->data == 0); + if (u->data) // FIXIT Workaround for UMat synchronization issue + { + //CV_Assert(u->hostCopyObsolete() == false); + return; + } // because there can be other map requests for the same UMat with different access flags, // we use the universal (read-write) access mode. 
cl_int retval = 0; @@ -3844,6 +3889,10 @@ public: UMatDataAutoLock autolock(u); + // FIXIT Workaround for UMat synchronization issue + if(u->refcount > 0) + return; + cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr(); cl_int retval = 0; if( !u->copyOnMap() && u->data ) @@ -4404,15 +4453,32 @@ int predictOptimalVectorWidth(InputArray src1, InputArray src2, InputArray src3, #undef PROCESS_SRC -/////////////////////////////////////////// Image2D //////////////////////////////////////////////////// + +// TODO Make this as a method of OpenCL "BuildOptions" class +void buildOptionsAddMatrixDescription(String& buildOptions, const String& name, InputArray _m) +{ + if (!buildOptions.empty()) + buildOptions += " "; + int type = _m.type(), depth = CV_MAT_DEPTH(type); + buildOptions += format( + "-D %s_T=%s -D %s_T1=%s -D %s_CN=%d -D %s_TSIZE=%d -D %s_T1SIZE=%d -D %s_DEPTH=%d", + name.c_str(), ocl::typeToStr(type), + name.c_str(), ocl::typeToStr(CV_MAKE_TYPE(depth, 1)), + name.c_str(), (int)CV_MAT_CN(type), + name.c_str(), (int)CV_ELEM_SIZE(type), + name.c_str(), (int)CV_ELEM_SIZE1(type), + name.c_str(), (int)depth + ); +} + struct Image2D::Impl { - Impl(const UMat &src) + Impl(const UMat &src, bool norm, bool alias) { handle = 0; refcount = 1; - init(src); + init(src, norm, alias); } ~Impl() @@ -4421,25 +4487,56 @@ struct Image2D::Impl clReleaseMemObject(handle); } - void init(const UMat &src) + static cl_image_format getImageFormat(int depth, int cn, bool norm) + { + cl_image_format format; + static const int channelTypes[] = { CL_UNSIGNED_INT8, CL_SIGNED_INT8, CL_UNSIGNED_INT16, + CL_SIGNED_INT16, CL_SIGNED_INT32, CL_FLOAT, -1, -1 }; + static const int channelTypesNorm[] = { CL_UNORM_INT8, CL_SNORM_INT8, CL_UNORM_INT16, + CL_SNORM_INT16, -1, -1, -1, -1 }; + static const int channelOrders[] = { -1, CL_R, CL_RG, -1, CL_RGBA }; + + int channelType = norm ? 
channelTypesNorm[depth] : channelTypes[depth]; + int channelOrder = channelOrders[cn]; + format.image_channel_data_type = (cl_channel_type)channelType; + format.image_channel_order = (cl_channel_order)channelOrder; + return format; + } + + static bool isFormatSupported(cl_image_format format) + { + cl_context context = (cl_context)Context::getDefault().ptr(); + // Figure out how many formats are supported by this context. + cl_uint numFormats = 0; + cl_int err = clGetSupportedImageFormats(context, CL_MEM_READ_WRITE, + CL_MEM_OBJECT_IMAGE2D, numFormats, + NULL, &numFormats); + AutoBuffer formats(numFormats); + err = clGetSupportedImageFormats(context, CL_MEM_READ_WRITE, + CL_MEM_OBJECT_IMAGE2D, numFormats, + formats, NULL); + CV_OclDbgAssert(err == CL_SUCCESS); + for (cl_uint i = 0; i < numFormats; ++i) + { + if (!memcmp(&formats[i], &format, sizeof(format))) + { + return true; + } + } + return false; + } + + void init(const UMat &src, bool norm, bool alias) { CV_Assert(ocl::Device::getDefault().imageSupport()); - cl_image_format format; int err, depth = src.depth(), cn = src.channels(); CV_Assert(cn <= 4); + cl_image_format format = getImageFormat(depth, cn, norm); - static const int channelTypes[] = { CL_UNSIGNED_INT8, CL_SIGNED_INT8, CL_UNSIGNED_INT16, - CL_SIGNED_INT16, CL_SIGNED_INT32, CL_FLOAT, -1, -1 }; - static const int channelOrders[] = { -1, CL_R, CL_RG, -1, CL_RGBA }; - - int channelType = channelTypes[depth], channelOrder = channelOrders[cn]; - if (channelType < 0 || channelOrder < 0) + if (!isFormatSupported(format)) CV_Error(Error::OpenCLApiCallError, "Image format is not supported"); - format.image_channel_data_type = (cl_channel_type)channelType; - format.image_channel_order = (cl_channel_order)channelOrder; - cl_context context = (cl_context)Context::getDefault().ptr(); cl_command_queue queue = (cl_command_queue)Queue::getDefault().ptr(); @@ -4448,6 +4545,7 @@ struct Image2D::Impl // run on OpenCL 1.1 platform if library binaries are compiled with 
OpenCL 1.2 support const Device & d = ocl::Device::getDefault(); int minor = d.deviceVersionMinor(), major = d.deviceVersionMajor(); + CV_Assert(!alias || canCreateAlias(src)); if (1 < major || (1 == major && 2 <= minor)) { cl_image_desc desc; @@ -4456,9 +4554,9 @@ struct Image2D::Impl desc.image_height = src.rows; desc.image_depth = 0; desc.image_array_size = 1; - desc.image_row_pitch = 0; + desc.image_row_pitch = alias ? src.step[0] : 0; desc.image_slice_pitch = 0; - desc.buffer = NULL; + desc.buffer = alias ? (cl_mem)src.handle(ACCESS_RW) : 0; desc.num_mip_levels = 0; desc.num_samples = 0; handle = clCreateImage(context, CL_MEM_READ_WRITE, &format, &desc, NULL, &err); @@ -4466,7 +4564,10 @@ struct Image2D::Impl else #endif { + CV_SUPPRESS_DEPRECATED_START + CV_Assert(!alias); // This is an OpenCL 1.2 extension handle = clCreateImage2D(context, CL_MEM_READ_WRITE, &format, src.cols, src.rows, 0, NULL, &err); + CV_SUPPRESS_DEPRECATED_END } CV_OclDbgAssert(err == CL_SUCCESS); @@ -4474,7 +4575,7 @@ struct Image2D::Impl size_t region[] = { src.cols, src.rows, 1 }; cl_mem devData; - if (!src.isContinuous()) + if (!alias && !src.isContinuous()) { devData = clCreateBuffer(context, CL_MEM_READ_ONLY, src.cols * src.rows * src.elemSize(), NULL, &err); CV_OclDbgAssert(err == CL_SUCCESS); @@ -4485,14 +4586,19 @@ struct Image2D::Impl CV_OclDbgAssert(clFlush(queue) == CL_SUCCESS); } else + { devData = (cl_mem)src.handle(ACCESS_READ); + } CV_Assert(devData != NULL); - CV_OclDbgAssert(clEnqueueCopyBufferToImage(queue, devData, handle, 0, origin, region, 0, NULL, 0) == CL_SUCCESS); - if (!src.isContinuous()) + if (!alias) { - CV_OclDbgAssert(clFlush(queue) == CL_SUCCESS); - CV_OclDbgAssert(clReleaseMemObject(devData) == CL_SUCCESS); + CV_OclDbgAssert(clEnqueueCopyBufferToImage(queue, devData, handle, 0, origin, region, 0, NULL, 0) == CL_SUCCESS); + if (!src.isContinuous()) + { + CV_OclDbgAssert(clFlush(queue) == CL_SUCCESS); + CV_OclDbgAssert(clReleaseMemObject(devData) == 
CL_SUCCESS); + } } } @@ -4506,9 +4612,37 @@ Image2D::Image2D() p = NULL; } -Image2D::Image2D(const UMat &src) +Image2D::Image2D(const UMat &src, bool norm, bool alias) { - p = new Impl(src); + p = new Impl(src, norm, alias); +} + +bool Image2D::canCreateAlias(const UMat &m) +{ + bool ret = false; + const Device & d = ocl::Device::getDefault(); + if (d.imageFromBufferSupport()) + { + // This is the required pitch alignment in pixels + uint pitchAlign = d.imagePitchAlignment(); + if (pitchAlign && !(m.step % (pitchAlign * m.elemSize()))) + { + // We don't currently handle the case where the buffer was created + // with CL_MEM_USE_HOST_PTR + if (!m.u->tempUMat()) + { + ret = true; + } + } + } + return ret; +} + +bool Image2D::isFormatSupported(int depth, int cn, bool norm) +{ + cl_image_format format = Impl::getImageFormat(depth, cn, norm); + + return Impl::isFormatSupported(format); } Image2D::Image2D(const Image2D & i) diff --git a/modules/core/src/stat.cpp b/modules/core/src/stat.cpp index 6d590f907..66866abb8 100644 --- a/modules/core/src/stat.cpp +++ b/modules/core/src/stat.cpp @@ -692,11 +692,11 @@ int cv::countNonZero( InputArray _src ) Mat src = _src.getMat(); -#if defined HAVE_IPP && !defined HAVE_IPP_ICV_ONLY +#if defined HAVE_IPP && !defined HAVE_IPP_ICV_ONLY && 0 if (src.dims <= 2 || src.isContinuous()) { IppiSize roiSize = { src.cols, src.rows }; - Ipp32s count, srcstep = (Ipp32s)src.step; + Ipp32s count = 0, srcstep = (Ipp32s)src.step; IppStatus status = (IppStatus)-1; if (src.isContinuous()) diff --git a/modules/core/test/test_arithm.cpp b/modules/core/test/test_arithm.cpp index 2527e5397..1c038b942 100644 --- a/modules/core/test/test_arithm.cpp +++ b/modules/core/test/test_arithm.cpp @@ -1361,7 +1361,6 @@ TEST_P(ElemWiseTest, accuracy) op->op(src, dst, mask); double maxErr = op->getMaxErr(depth); - vector pos; ASSERT_PRED_FORMAT2(cvtest::MatComparator(maxErr, op->context), dst0, dst) << "\nsrc[0] ~ " << cvtest::MatInfo(!src.empty() ? 
src[0] : Mat()) << "\ntestCase #" << testIdx << "\n"; } diff --git a/modules/features2d/src/matchers.cpp b/modules/features2d/src/matchers.cpp index 6bc925f8a..2b5605031 100644 --- a/modules/features2d/src/matchers.cpp +++ b/modules/features2d/src/matchers.cpp @@ -330,7 +330,7 @@ static bool ocl_match2Dispatcher(InputArray query, InputArray train, const UMat static bool ocl_kmatchDispatcher(InputArray query, InputArray train, const UMat &trainIdx, const UMat &distance, int distType) { - return ocl_match2Dispatcher(query, train, trainIdx, distance, distType); + return ocl_match2Dispatcher(query, train, trainIdx, distance, distType); } static bool ocl_knnMatchSingle(InputArray query, InputArray train, UMat &trainIdx, @@ -1209,8 +1209,8 @@ FlannBasedMatcher::FlannBasedMatcher( const Ptr& _indexParam void FlannBasedMatcher::add( InputArrayOfArrays _descriptors ) { DescriptorMatcher::add( _descriptors ); - std::vector descriptors; - _descriptors.getMatVector(descriptors); + std::vector descriptors; + _descriptors.getUMatVector(descriptors); for( size_t i = 0; i < descriptors.size(); i++ ) { @@ -1232,6 +1232,13 @@ void FlannBasedMatcher::train() { if( !flannIndex || mergedDescriptors.size() < addedDescCount ) { + // FIXIT: Workaround for 'utrainDescCollection' issue (PR #2142) + if (!utrainDescCollection.empty()) + { + CV_Assert(trainDescCollection.size() == 0); + for (size_t i = 0; i < utrainDescCollection.size(); ++i) + trainDescCollection.push_back(utrainDescCollection[i].getMat(ACCESS_READ)); + } mergedDescriptors.set( trainDescCollection ); flannIndex = makePtr( mergedDescriptors.getDescriptors(), *indexParams ); } diff --git a/modules/features2d/test/test_nearestneighbors.cpp b/modules/features2d/test/test_nearestneighbors.cpp index 3d80b0db1..0c2c70b7d 100644 --- a/modules/features2d/test/test_nearestneighbors.cpp +++ b/modules/features2d/test/test_nearestneighbors.cpp @@ -215,8 +215,6 @@ int CV_KDTreeTest_CPP::findNeighbors( Mat& points, Mat& neighbors ) const 
int emax = 20; Mat neighbors2( neighbors.size(), CV_32SC1 ); int j; - vector min(points.cols, static_cast(minValue)); - vector max(points.cols, static_cast(maxValue)); for( int pi = 0; pi < points.rows; pi++ ) { // 1st way diff --git a/modules/imgproc/perf/opencl/perf_filters.cpp b/modules/imgproc/perf/opencl/perf_filters.cpp index f7329e319..d4dfbe0cc 100644 --- a/modules/imgproc/perf/opencl/perf_filters.cpp +++ b/modules/imgproc/perf/opencl/perf_filters.cpp @@ -112,7 +112,7 @@ OCL_PERF_TEST_P(LaplacianFixture, Laplacian, const FilterParams params = GetParam(); const Size srcSize = get<0>(params); const int type = get<1>(params), ksize = get<2>(params); - const double eps = CV_MAT_DEPTH(type) <= CV_32S ? 1 : 1e-5; + const double eps = CV_MAT_DEPTH(type) <= CV_32S ? 1 : 2e-5; checkDeviceMaxMemoryAllocSize(srcSize, type); diff --git a/modules/imgproc/perf/perf_matchTemplate.cpp b/modules/imgproc/perf/perf_matchTemplate.cpp index a1f0ddeab..bf5e687fa 100644 --- a/modules/imgproc/perf/perf_matchTemplate.cpp +++ b/modules/imgproc/perf/perf_matchTemplate.cpp @@ -42,7 +42,7 @@ PERF_TEST_P(ImgSize_TmplSize_Method, matchTemplateSmall, method == TM_CCORR_NORMED || method == TM_SQDIFF_NORMED || method == TM_CCOEFF_NORMED; - double eps = isNormed ? 1e-6 + double eps = isNormed ? 
1e-5 : 255 * 255 * tmpl.total() * 1e-6; SANITY_CHECK(result, eps); diff --git a/modules/imgproc/src/color.cpp b/modules/imgproc/src/color.cpp index ce70ea5c6..1c9310e41 100644 --- a/modules/imgproc/src/color.cpp +++ b/modules/imgproc/src/color.cpp @@ -374,6 +374,18 @@ static ippiGeneralFunc ippiHLS2RGBTab[] = 0, (ippiGeneralFunc)ippiHLSToRGB_32f_C3R, 0, 0 }; +static ippiGeneralFunc ippiRGBToLUVTab[] = +{ + (ippiGeneralFunc)ippiRGBToLUV_8u_C3R, 0, (ippiGeneralFunc)ippiRGBToLUV_16u_C3R, 0, + 0, (ippiGeneralFunc)ippiRGBToLUV_32f_C3R, 0, 0 +}; + +static ippiGeneralFunc ippiLUVToRGBTab[] = +{ + (ippiGeneralFunc)ippiLUVToRGB_8u_C3R, 0, (ippiGeneralFunc)ippiLUVToRGB_16u_C3R, 0, + 0, (ippiGeneralFunc)ippiLUVToRGB_32f_C3R, 0, 0 +}; + struct IPPGeneralFunctor { IPPGeneralFunctor(ippiGeneralFunc _func) : func(_func){} @@ -3323,6 +3335,34 @@ void cv::cvtColor( InputArray _src, OutputArray _dst, int code, int dcn ) _dst.create(sz, CV_8UC2); dst = _dst.getMat(); +#if defined HAVE_IPP + CV_SUPPRESS_DEPRECATED_START + if (code == CV_BGR2BGR565 && scn == 3) + { + if (CvtColorIPPLoop(src, dst, IPPGeneralFunctor((ippiGeneralFunc)ippiBGRToBGR565_8u16u_C3R))) + return; + } + else if (code == CV_BGRA2BGR565) + { + if (CvtColorIPPLoopCopy(src, dst, IPPReorderGeneralFunctor(ippiSwapChannelsC4C3RTab[depth], + (ippiGeneralFunc)ippiBGRToBGR565_8u16u_C3R, 0, 1, 2, depth))) + return; + } + else if (code == CV_RGB2BGR565) + { + if( CvtColorIPPLoopCopy(src, dst, IPPReorderGeneralFunctor(ippiSwapChannelsC3RTab[depth], + (ippiGeneralFunc)ippiBGRToBGR565_8u16u_C3R, 2, 1, 0, depth)) ) + return; + } + else if (code == CV_RGBA2BGR565) + { + if( CvtColorIPPLoopCopy(src, dst, IPPReorderGeneralFunctor(ippiSwapChannelsC4C3RTab[depth], + (ippiGeneralFunc)ippiBGRToBGR565_8u16u_C3R, 2, 1, 0, depth)) ) + return; + } + CV_SUPPRESS_DEPRECATED_END +#endif + #ifdef HAVE_TEGRA_OPTIMIZATION if(code == CV_BGR2BGR565 || code == CV_BGRA2BGR565 || code == CV_RGB2BGR565 || code == CV_RGBA2BGR565) 
if(tegra::cvtRGB2RGB565(src, dst, code == CV_RGB2BGR565 || code == CV_RGBA2BGR565 ? 0 : 2)) @@ -3344,6 +3384,34 @@ void cv::cvtColor( InputArray _src, OutputArray _dst, int code, int dcn ) _dst.create(sz, CV_MAKETYPE(depth, dcn)); dst = _dst.getMat(); +#ifdef HAVE_IPP + CV_SUPPRESS_DEPRECATED_START + if (code == CV_BGR5652BGR) + { + if (CvtColorIPPLoop(src, dst, IPPGeneralFunctor((ippiGeneralFunc)ippiBGR565ToBGR_16u8u_C3R))) + return; + } + else if (code == CV_BGR5652RGB) + { + if (CvtColorIPPLoop(src, dst, IPPGeneralReorderFunctor((ippiGeneralFunc)ippiBGR565ToBGR_16u8u_C3R, + ippiSwapChannelsC3RTab[depth], 2, 1, 0, depth))) + return; + } + if (code == CV_BGR5652BGRA) + { + if (CvtColorIPPLoop(src, dst, IPPGeneralReorderFunctor((ippiGeneralFunc)ippiBGR565ToBGR_16u8u_C3R, + ippiSwapChannelsC3C4RTab[depth], 0, 1, 2, depth))) + return; + } + else if (code == CV_BGR5652RGBA) + { + if (CvtColorIPPLoop(src, dst, IPPGeneralReorderFunctor((ippiGeneralFunc)ippiBGR565ToBGR_16u8u_C3R, + ippiSwapChannelsC3C4RTab[depth], 2, 1, 0, depth))) + return; + } + CV_SUPPRESS_DEPRECATED_END +#endif + CvtColorLoop(src, dst, RGB5x52RGB(dcn, code == CV_BGR5652BGR || code == CV_BGR5552BGR || code == CV_BGR5652BGRA || code == CV_BGR5552BGRA ? 
0 : 2, // blue idx @@ -3457,6 +3525,32 @@ void cv::cvtColor( InputArray _src, OutputArray _dst, int code, int dcn ) _dst.create(sz, CV_MAKETYPE(depth, 3)); dst = _dst.getMat(); +#if defined HAVE_IPP && 0 + if (code == CV_RGB2YUV && scn == 3 && depth == CV_8U) + { + if (CvtColorIPPLoop(src, dst, IPPGeneralFunctor((ippiGeneralFunc)ippiRGBToYUV_8u_C3R))) + return; + } + else if (code == CV_BGR2YUV && scn == 3 && depth == CV_8U) + { + if (CvtColorIPPLoop(src, dst, IPPReorderGeneralFunctor(ippiSwapChannelsC3RTab[depth], + (ippiGeneralFunc)ippiRGBToYUV_8u_C3R, 2, 1, 0, depth))) + return; + } + else if (code == CV_RGB2YUV && scn == 4 && depth == CV_8U) + { + if (CvtColorIPPLoop(src, dst, IPPReorderGeneralFunctor(ippiSwapChannelsC4C3RTab[depth], + (ippiGeneralFunc)ippiRGBToYUV_8u_C3R, 0, 1, 2, depth))) + return; + } + else if (code == CV_BGR2YUV && scn == 4 && depth == CV_8U) + { + if (CvtColorIPPLoop(src, dst, IPPReorderGeneralFunctor(ippiSwapChannelsC4C3RTab[depth], + (ippiGeneralFunc)ippiRGBToYUV_8u_C3R, 2, 1, 0, depth))) + return; + } +#endif + if( depth == CV_8U ) { #ifdef HAVE_TEGRA_OPTIMIZATION @@ -3486,6 +3580,32 @@ void cv::cvtColor( InputArray _src, OutputArray _dst, int code, int dcn ) _dst.create(sz, CV_MAKETYPE(depth, dcn)); dst = _dst.getMat(); +#if defined HAVE_IPP && 0 + if (code == CV_YUV2RGB && dcn == 3 && depth == CV_8U) + { + if (CvtColorIPPLoop(src, dst, IPPGeneralFunctor((ippiGeneralFunc)ippiYUVToRGB_8u_C3R))) + return; + } + else if (code == CV_YUV2BGR && dcn == 3 && depth == CV_8U) + { + if (CvtColorIPPLoop(src, dst, IPPGeneralReorderFunctor((ippiGeneralFunc)ippiYUVToRGB_8u_C3R, + ippiSwapChannelsC3RTab[depth], 2, 1, 0, depth))) + return; + } + else if (code == CV_YUV2RGB && dcn == 4 && depth == CV_8U) + { + if (CvtColorIPPLoop(src, dst, IPPGeneralReorderFunctor((ippiGeneralFunc)ippiYUVToRGB_8u_C3R, + ippiSwapChannelsC3C4RTab[depth], 0, 1, 2, depth))) + return; + } + else if (code == CV_YUV2BGR && dcn == 4 && depth == CV_8U) + { + if 
(CvtColorIPPLoop(src, dst, IPPGeneralReorderFunctor((ippiGeneralFunc)ippiYUVToRGB_8u_C3R, + ippiSwapChannelsC3C4RTab[depth], 2, 1, 0, depth))) + return; + } +#endif + if( depth == CV_8U ) CvtColorLoop(src, dst, YCrCb2RGB_i(dcn, bidx, coeffs_i)); else if( depth == CV_16U ) @@ -3741,6 +3861,55 @@ void cv::cvtColor( InputArray _src, OutputArray _dst, int code, int dcn ) _dst.create(sz, CV_MAKETYPE(depth, 3)); dst = _dst.getMat(); +#ifdef HAVE_IPP + if (code == CV_LBGR2Lab && scn == 3 && depth == CV_8U) + { + if (CvtColorIPPLoop(src, dst, IPPGeneralFunctor((ippiGeneralFunc)ippiBGRToLab_8u_C3R))) + return; + } + else if (code == CV_LBGR2Lab && scn == 4 && depth == CV_8U) + { + if (CvtColorIPPLoop(src, dst, IPPReorderGeneralFunctor(ippiSwapChannelsC4C3RTab[depth], + (ippiGeneralFunc)ippiBGRToLab_8u_C3R, 0, 1, 2, depth))) + return; + } + else if (code == CV_LRGB2Lab && scn == 3 && depth == CV_8U) + { + if (CvtColorIPPLoop(src, dst, IPPReorderGeneralFunctor(ippiSwapChannelsC3RTab[depth], + (ippiGeneralFunc)ippiBGRToLab_8u_C3R, 2, 1, 0, depth))) + return; + } + else if (code == CV_LRGB2Lab && scn == 4 && depth == CV_8U) + { + if (CvtColorIPPLoop(src, dst, IPPReorderGeneralFunctor(ippiSwapChannelsC4C3RTab[depth], + (ippiGeneralFunc)ippiBGRToLab_8u_C3R, 2, 1, 0, depth))) + return; + } + else if (code == CV_LRGB2Luv && scn == 3) + { + if (CvtColorIPPLoop(src, dst, IPPGeneralFunctor(ippiRGBToLUVTab[depth]))) + return; + } + else if (code == CV_LRGB2Luv && scn == 4) + { + if (CvtColorIPPLoop(src, dst, IPPReorderGeneralFunctor(ippiSwapChannelsC4C3RTab[depth], + ippiRGBToLUVTab[depth], 0, 1, 2, depth))) + return; + } + else if (code == CV_LBGR2Luv && scn == 3) + { + if (CvtColorIPPLoop(src, dst, IPPReorderGeneralFunctor(ippiSwapChannelsC3RTab[depth], + ippiRGBToLUVTab[depth], 2, 1, 0, depth))) + return; + } + else if (code == CV_LBGR2Luv && scn == 4) + { + if (CvtColorIPPLoop(src, dst, IPPReorderGeneralFunctor(ippiSwapChannelsC4C3RTab[depth], + ippiRGBToLUVTab[depth], 2, 1, 0, 
depth))) + return; + } +#endif + if( code == CV_BGR2Lab || code == CV_RGB2Lab || code == CV_LBGR2Lab || code == CV_LRGB2Lab ) { @@ -3772,6 +3941,57 @@ void cv::cvtColor( InputArray _src, OutputArray _dst, int code, int dcn ) _dst.create(sz, CV_MAKETYPE(depth, dcn)); dst = _dst.getMat(); +#if defined (HAVE_IPP) +#if 0 + if( code == CV_Lab2LBGR && dcn == 3 && depth == CV_8U) + { + if( CvtColorIPPLoop(src, dst, IPPGeneralFunctor((ippiGeneralFunc)ippiLabToBGR_8u_C3R)) ) + return; + } + else if( code == CV_Lab2LBGR && dcn == 4 && depth == CV_8U ) + { + if( CvtColorIPPLoop(src, dst, IPPGeneralReorderFunctor((ippiGeneralFunc)ippiLabToBGR_8u_C3R, + ippiSwapChannelsC3C4RTab[depth], 0, 1, 2, depth)) ) + return; + } + if( code == CV_Lab2LRGB && dcn == 3 && depth == CV_8U ) + { + if( CvtColorIPPLoop(src, dst, IPPGeneralReorderFunctor((ippiGeneralFunc)ippiLabToBGR_8u_C3R, + ippiSwapChannelsC3RTab[depth], 2, 1, 0, depth)) ) + return; + } + else if( code == CV_Lab2LRGB && dcn == 4 && depth == CV_8U ) + { + if( CvtColorIPPLoop(src, dst, IPPGeneralReorderFunctor((ippiGeneralFunc)ippiLabToBGR_8u_C3R, + ippiSwapChannelsC3C4RTab[depth], 2, 1, 0, depth)) ) + return; + } +#endif + if( code == CV_Luv2LRGB && dcn == 3 ) + { + if( CvtColorIPPLoop(src, dst, IPPGeneralFunctor(ippiLUVToRGBTab[depth])) ) + return; + } + else if( code == CV_Luv2LRGB && dcn == 4 ) + { + if( CvtColorIPPLoop(src, dst, IPPGeneralReorderFunctor(ippiLUVToRGBTab[depth], + ippiSwapChannelsC3C4RTab[depth], 0, 1, 2, depth)) ) + return; + } + if( code == CV_Luv2LBGR && dcn == 3 ) + { + if( CvtColorIPPLoop(src, dst, IPPGeneralReorderFunctor(ippiLUVToRGBTab[depth], + ippiSwapChannelsC3RTab[depth], 2, 1, 0, depth)) ) + return; + } + else if( code == CV_Luv2LBGR && dcn == 4 ) + { + if( CvtColorIPPLoop(src, dst, IPPGeneralReorderFunctor(ippiLUVToRGBTab[depth], + ippiSwapChannelsC3C4RTab[depth], 2, 1, 0, depth)) ) + return; + } +#endif + if( code == CV_Lab2BGR || code == CV_Lab2RGB || code == CV_Lab2LBGR || code == CV_Lab2LRGB 
) { @@ -3968,7 +4188,13 @@ void cv::cvtColor( InputArray _src, OutputArray _dst, int code, int dcn ) dst = _dst.getMat(); if( depth == CV_8U ) + { +#ifdef HAVE_IPP + if (CvtColorIPPLoop(src, dst, IPPGeneralFunctor((ippiGeneralFunc)ippiAlphaPremul_8u_AC4R))) + return; +#endif CvtColorLoop(src, dst, RGBA2mRGBA()); + } else { CV_Error( CV_StsBadArg, "Unsupported image depth" ); diff --git a/modules/imgproc/src/deriv.cpp b/modules/imgproc/src/deriv.cpp index 1b3e2c417..326f86cd4 100644 --- a/modules/imgproc/src/deriv.cpp +++ b/modules/imgproc/src/deriv.cpp @@ -577,6 +577,64 @@ void cv::Laplacian( InputArray _src, OutputArray _dst, int ddepth, int ksize, ddepth = sdepth; _dst.create( _src.size(), CV_MAKETYPE(ddepth, cn) ); +#if defined HAVE_IPP && !defined HAVE_IPP_ICV_ONLY + if ((ksize == 3 || ksize == 5) && ((borderType & BORDER_ISOLATED) != 0 || !_src.isSubmatrix()) && + ((stype == CV_8UC1 && ddepth == CV_16S) || (ddepth == CV_32F && stype == CV_32FC1))) + { + int iscale = saturate_cast(scale), idelta = saturate_cast(delta); + bool floatScale = std::fabs(scale - iscale) > DBL_EPSILON, needScale = iscale != 1; + bool floatDelta = std::fabs(delta - idelta) > DBL_EPSILON, needDelta = delta != 0; + int borderTypeNI = borderType & ~BORDER_ISOLATED; + Mat src = _src.getMat(), dst = _dst.getMat(); + + if (src.data != dst.data) + { + Ipp32s bufsize; + IppStatus status = (IppStatus)-1; + IppiSize roisize = { src.cols, src.rows }; + IppiMaskSize masksize = ksize == 3 ? 
ippMskSize3x3 : ippMskSize5x5; + IppiBorderType borderTypeIpp = ippiGetBorderType(borderTypeNI); + +#define IPP_FILTER_LAPLACIAN(ippsrctype, ippdsttype, ippfavor) \ + do \ + { \ + if (borderTypeIpp >= 0 && ippiFilterLaplacianGetBufferSize_##ippfavor##_C1R(roisize, masksize, &bufsize) >= 0) \ + { \ + Ipp8u * buffer = ippsMalloc_8u(bufsize); \ + status = ippiFilterLaplacianBorder_##ippfavor##_C1R((const ippsrctype *)src.data, (int)src.step, (ippdsttype *)dst.data, \ + (int)dst.step, roisize, masksize, borderTypeIpp, 0, buffer); \ + ippsFree(buffer); \ + } \ + } while ((void)0, 0) + + CV_SUPPRESS_DEPRECATED_START + if (sdepth == CV_8U && ddepth == CV_16S && !floatScale && !floatDelta) + { + IPP_FILTER_LAPLACIAN(Ipp8u, Ipp16s, 8u16s); + + if (needScale && status >= 0) + status = ippiMulC_16s_C1IRSfs((Ipp16s)iscale, (Ipp16s *)dst.data, (int)dst.step, roisize, 0); + if (needDelta && status >= 0) + status = ippiAddC_16s_C1IRSfs((Ipp16s)idelta, (Ipp16s *)dst.data, (int)dst.step, roisize, 0); + } + else if (sdepth == CV_32F && ddepth == CV_32F) + { + IPP_FILTER_LAPLACIAN(Ipp32f, Ipp32f, 32f); + + if (needScale && status >= 0) + status = ippiMulC_32f_C1IR((Ipp32f)scale, (Ipp32f *)dst.data, (int)dst.step, roisize); + if (needDelta && status >= 0) + status = ippiAddC_32f_C1IR((Ipp32f)delta, (Ipp32f *)dst.data, (int)dst.step, roisize); + } + CV_SUPPRESS_DEPRECATED_END + + if (status >= 0) + return; + } + } +#undef IPP_FILTER_LAPLACIAN +#endif + #ifdef HAVE_TEGRA_OPTIMIZATION if (scale == 1.0 && delta == 0) { diff --git a/modules/imgproc/src/filter.cpp b/modules/imgproc/src/filter.cpp index c9a5ed1e7..1af9e9d2f 100644 --- a/modules/imgproc/src/filter.cpp +++ b/modules/imgproc/src/filter.cpp @@ -1413,14 +1413,14 @@ struct RowVec_32f { kernel = _kernel; haveSSE = checkHardwareSupport(CV_CPU_SSE); -#ifdef USE_IPP_SEP_FILTERS +#if defined USE_IPP_SEP_FILTERS && 0 bufsz = -1; #endif } int operator()(const uchar* _src, uchar* _dst, int width, int cn) const { -#ifdef 
USE_IPP_SEP_FILTERS +#if defined USE_IPP_SEP_FILTERS && 0 int ret = ippiOperator(_src, _dst, width, cn); if (ret > 0) return ret; @@ -1458,13 +1458,13 @@ struct RowVec_32f Mat kernel; bool haveSSE; -#ifdef USE_IPP_SEP_FILTERS +#if defined USE_IPP_SEP_FILTERS && 0 private: mutable int bufsz; int ippiOperator(const uchar* _src, uchar* _dst, int width, int cn) const { int _ksize = kernel.rows + kernel.cols - 1; -// if ((1 != cn && 3 != cn) || width < _ksize*8) + if ((1 != cn && 3 != cn) || width < _ksize*8) return 0; const float* src = (const float*)_src; @@ -3280,12 +3280,15 @@ static bool ocl_filter2D( InputArray _src, OutputArray _dst, int ddepth, return k.run(2, globalsize, localsize, false); } +const int shift_bits = 8; + static bool ocl_sepRowFilter2D(const UMat & src, UMat & buf, const Mat & kernelX, int anchor, - int borderType, int ddepth, bool fast8uc1) + int borderType, int ddepth, bool fast8uc1, bool int_arithm) { int type = src.type(), cn = CV_MAT_CN(type), sdepth = CV_MAT_DEPTH(type); bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0; Size bufSize = buf.size(); + int buf_type = buf.type(), bdepth = CV_MAT_DEPTH(buf_type); if (!doubleSupport && (sdepth == CV_64F || ddepth == CV_64F)) return false; @@ -3313,15 +3316,16 @@ static bool ocl_sepRowFilter2D(const UMat & src, UMat & buf, const Mat & kernelX char cvt[40]; cv::String build_options = cv::format("-D RADIUSX=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d -D %s -D %s -D %s" - " -D srcT=%s -D dstT=%s -D convertToDstT=%s -D srcT1=%s -D dstT1=%s%s", + " -D srcT=%s -D dstT=%s -D convertToDstT=%s -D srcT1=%s -D dstT1=%s%s%s", radiusX, (int)localsize[0], (int)localsize[1], cn, btype, extra_extrapolation ? "EXTRA_EXTRAPOLATION" : "NO_EXTRA_EXTRAPOLATION", isolated ? "BORDER_ISOLATED" : "NO_BORDER_ISOLATED", - ocl::typeToStr(type), ocl::typeToStr(CV_32FC(cn)), - ocl::convertTypeStr(sdepth, CV_32F, cn, cvt), - ocl::typeToStr(sdepth), ocl::typeToStr(CV_32F), - doubleSupport ? 
" -D DOUBLE_SUPPORT" : ""); - build_options += ocl::kernelToStr(kernelX, CV_32F); + ocl::typeToStr(type), ocl::typeToStr(buf_type), + ocl::convertTypeStr(sdepth, bdepth, cn, cvt), + ocl::typeToStr(sdepth), ocl::typeToStr(bdepth), + doubleSupport ? " -D DOUBLE_SUPPORT" : "", + int_arithm ? " -D INTEGER_ARITHMETIC" : ""); + build_options += ocl::kernelToStr(kernelX, bdepth); Size srcWholeSize; Point srcOffset; src.locateROI(srcWholeSize, srcOffset); @@ -3348,7 +3352,7 @@ static bool ocl_sepRowFilter2D(const UMat & src, UMat & buf, const Mat & kernelX return k.run(2, globalsize, localsize, false); } -static bool ocl_sepColFilter2D(const UMat & buf, UMat & dst, const Mat & kernelY, double delta, int anchor) +static bool ocl_sepColFilter2D(const UMat & buf, UMat & dst, const Mat & kernelY, double delta, int anchor, bool int_arithm) { bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0; if (dst.depth() == CV_64F && !doubleSupport) @@ -3363,6 +3367,7 @@ static bool ocl_sepColFilter2D(const UMat & buf, UMat & dst, const Mat & kernelY int dtype = dst.type(), cn = CV_MAT_CN(dtype), ddepth = CV_MAT_DEPTH(dtype); Size sz = dst.size(); + int buf_type = buf.type(), bdepth = CV_MAT_DEPTH(buf_type); globalsize[1] = DIVUP(sz.height, localsize[1]) * localsize[1]; globalsize[0] = DIVUP(sz.width, localsize[0]) * localsize[0]; @@ -3370,13 +3375,14 @@ static bool ocl_sepColFilter2D(const UMat & buf, UMat & dst, const Mat & kernelY char cvt[40]; cv::String build_options = cv::format("-D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d" " -D srcT=%s -D dstT=%s -D convertToDstT=%s" - " -D srcT1=%s -D dstT1=%s%s", + " -D srcT1=%s -D dstT1=%s -D SHIFT_BITS=%d%s%s", anchor, (int)localsize[0], (int)localsize[1], cn, - ocl::typeToStr(buf.type()), ocl::typeToStr(dtype), - ocl::convertTypeStr(CV_32F, ddepth, cn, cvt), - ocl::typeToStr(CV_32F), ocl::typeToStr(ddepth), - doubleSupport ? 
" -D DOUBLE_SUPPORT" : ""); - build_options += ocl::kernelToStr(kernelY, CV_32F); + ocl::typeToStr(buf_type), ocl::typeToStr(dtype), + ocl::convertTypeStr(bdepth, ddepth, cn, cvt), + ocl::typeToStr(bdepth), ocl::typeToStr(ddepth), + 2*shift_bits, doubleSupport ? " -D DOUBLE_SUPPORT" : "", + int_arithm ? " -D INTEGER_ARITHMETIC" : ""); + build_options += ocl::kernelToStr(kernelY, bdepth); ocl::Kernel k("col_filter", cv::ocl::imgproc::filterSepCol_oclsrc, build_options); @@ -3393,12 +3399,12 @@ const int optimizedSepFilterLocalSize = 16; static bool ocl_sepFilter2D_SinglePass(InputArray _src, OutputArray _dst, Mat row_kernel, Mat col_kernel, - double delta, int borderType, int ddepth) + double delta, int borderType, int ddepth, int bdepth, bool int_arithm) { Size size = _src.size(), wholeSize; Point origin; int stype = _src.type(), sdepth = CV_MAT_DEPTH(stype), cn = CV_MAT_CN(stype), - esz = CV_ELEM_SIZE(stype), wdepth = std::max(std::max(sdepth, ddepth), CV_32F), + esz = CV_ELEM_SIZE(stype), wdepth = std::max(std::max(sdepth, ddepth), bdepth), dtype = CV_MAKE_TYPE(ddepth, cn); size_t src_step = _src.step(), src_offset = _src.offset(); bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0; @@ -3418,14 +3424,15 @@ static bool ocl_sepFilter2D_SinglePass(InputArray _src, OutputArray _dst, String opts = cv::format("-D BLK_X=%d -D BLK_Y=%d -D RADIUSX=%d -D RADIUSY=%d%s%s" " -D srcT=%s -D convertToWT=%s -D WT=%s -D dstT=%s -D convertToDstT=%s" - " -D %s -D srcT1=%s -D dstT1=%s -D CN=%d", (int)lt2[0], (int)lt2[1], - row_kernel.cols / 2, col_kernel.cols / 2, - ocl::kernelToStr(row_kernel, CV_32F, "KERNEL_MATRIX_X").c_str(), - ocl::kernelToStr(col_kernel, CV_32F, "KERNEL_MATRIX_Y").c_str(), + " -D %s -D srcT1=%s -D dstT1=%s -D WT1=%s -D CN=%d -D SHIFT_BITS=%d%s", + (int)lt2[0], (int)lt2[1], row_kernel.cols / 2, col_kernel.cols / 2, + ocl::kernelToStr(row_kernel, wdepth, "KERNEL_MATRIX_X").c_str(), + ocl::kernelToStr(col_kernel, wdepth, 
"KERNEL_MATRIX_Y").c_str(), ocl::typeToStr(stype), ocl::convertTypeStr(sdepth, wdepth, cn, cvt[0]), ocl::typeToStr(CV_MAKE_TYPE(wdepth, cn)), ocl::typeToStr(dtype), ocl::convertTypeStr(wdepth, ddepth, cn, cvt[1]), borderMap[borderType], - ocl::typeToStr(sdepth), ocl::typeToStr(ddepth), cn); + ocl::typeToStr(sdepth), ocl::typeToStr(ddepth), ocl::typeToStr(wdepth), + cn, 2*shift_bits, int_arithm ? " -D INTEGER_ARITHMETIC" : ""); ocl::Kernel k("sep_filter", ocl::imgproc::filterSep_singlePass_oclsrc, opts); if (k.empty()) @@ -3468,19 +3475,37 @@ static bool ocl_sepFilter2D( InputArray _src, OutputArray _dst, int ddepth, if (ddepth < 0) ddepth = sdepth; - CV_OCL_RUN_(kernelY.cols <= 21 && kernelX.cols <= 21 && - imgSize.width > optimizedSepFilterLocalSize + (kernelX.cols >> 1) && - imgSize.height > optimizedSepFilterLocalSize + (kernelY.cols >> 1) && - (!(borderType & BORDER_ISOLATED) || _src.offset() == 0) && anchor == Point(-1, -1) && - (d.isIntel() || (d.isAMD() && !d.hostUnifiedMemory())), - ocl_sepFilter2D_SinglePass(_src, _dst, kernelX, kernelY, delta, - borderType & ~BORDER_ISOLATED, ddepth), true) - if (anchor.x < 0) anchor.x = kernelX.cols >> 1; if (anchor.y < 0) anchor.y = kernelY.cols >> 1; + int rtype = getKernelType(kernelX, + kernelX.rows == 1 ? Point(anchor.x, 0) : Point(0, anchor.x)); + int ctype = getKernelType(kernelY, + kernelY.rows == 1 ? 
Point(anchor.y, 0) : Point(0, anchor.y)); + + int bdepth = CV_32F; + bool int_arithm = false; + if( sdepth == CV_8U && ddepth == CV_8U && + rtype == KERNEL_SMOOTH+KERNEL_SYMMETRICAL && + ctype == KERNEL_SMOOTH+KERNEL_SYMMETRICAL) + { + bdepth = CV_32S; + kernelX.convertTo( kernelX, bdepth, 1 << shift_bits ); + kernelY.convertTo( kernelY, bdepth, 1 << shift_bits ); + int_arithm = true; + } + + CV_OCL_RUN_(kernelY.cols <= 21 && kernelX.cols <= 21 && + imgSize.width > optimizedSepFilterLocalSize + anchor.x && + imgSize.height > optimizedSepFilterLocalSize + anchor.y && + (!(borderType & BORDER_ISOLATED) || _src.offset() == 0) && + anchor == Point(kernelX.cols >> 1, kernelY.cols >> 1) && + (d.isIntel() || (d.isAMD() && !d.hostUnifiedMemory())), + ocl_sepFilter2D_SinglePass(_src, _dst, kernelX, kernelY, delta, + borderType & ~BORDER_ISOLATED, ddepth, bdepth, int_arithm), true) + UMat src = _src.getUMat(); Size srcWholeSize; Point srcOffset; src.locateROI(srcWholeSize, srcOffset); @@ -3490,14 +3515,14 @@ static bool ocl_sepFilter2D( InputArray _src, OutputArray _dst, int ddepth, Size srcSize = src.size(); Size bufSize(srcSize.width, srcSize.height + kernelY.cols - 1); - UMat buf(bufSize, CV_32FC(cn)); - if (!ocl_sepRowFilter2D(src, buf, kernelX, anchor.x, borderType, ddepth, fast8uc1)) + UMat buf(bufSize, CV_MAKETYPE(bdepth, cn)); + if (!ocl_sepRowFilter2D(src, buf, kernelX, anchor.x, borderType, ddepth, fast8uc1, int_arithm)) return false; _dst.create(srcSize, CV_MAKETYPE(ddepth, cn)); UMat dst = _dst.getUMat(); - return ocl_sepColFilter2D(buf, dst, kernelY, delta, anchor.y); + return ocl_sepColFilter2D(buf, dst, kernelY, delta, anchor.y, int_arithm); } #endif diff --git a/modules/imgproc/src/histogram.cpp b/modules/imgproc/src/histogram.cpp index 8e8484d6b..ea7354771 100644 --- a/modules/imgproc/src/histogram.cpp +++ b/modules/imgproc/src/histogram.cpp @@ -1242,7 +1242,7 @@ void cv::calcHist( const Mat* images, int nimages, const int* channels, bool ok = true; const 
Mat & src = images[0]; - int nstripes = std::min(8, src.total() / (1 << 16)); + int nstripes = std::min(8, static_cast(src.total() / (1 << 16))); #ifdef HAVE_CONCURRENCY nstripes = 1; #endif diff --git a/modules/imgproc/src/imgwarp.cpp b/modules/imgproc/src/imgwarp.cpp index 45a66bd83..03a9d79a3 100644 --- a/modules/imgproc/src/imgwarp.cpp +++ b/modules/imgproc/src/imgwarp.cpp @@ -2037,15 +2037,6 @@ static void ocl_computeResizeAreaTabs(int ssize, int dsize, double scale, int * ofs_tab[dx] = k; } -static void ocl_computeResizeAreaFastTabs(int * dmap_tab, int * smap_tab, int scale, int dcols, int scol) -{ - for (int i = 0; i < dcols; ++i) - dmap_tab[i] = scale * i; - - for (int i = 0, size = dcols * scale; i < size; ++i) - smap_tab[i] = std::min(scol - 1, i); -} - static bool ocl_resize( InputArray _src, OutputArray _dst, Size dsize, double fx, double fy, int interpolation) { @@ -2075,7 +2066,39 @@ static bool ocl_resize( InputArray _src, OutputArray _dst, Size dsize, ocl::Kernel k; size_t globalsize[] = { dst.cols, dst.rows }; - if (interpolation == INTER_LINEAR) + ocl::Image2D srcImage; + + // See if this could be done with a sampler. We stick with integer + // datatypes because the observed error is low. + bool useSampler = (interpolation == INTER_LINEAR && ocl::Device::getDefault().imageSupport() && + ocl::Image2D::canCreateAlias(src) && depth <= 4 && + ocl::Image2D::isFormatSupported(depth, cn, true)); + if (useSampler) + { + int wdepth = std::max(depth, CV_32S); + char buf[2][32]; + cv::String compileOpts = format("-D USE_SAMPLER -D depth=%d -D T=%s -D T1=%s " + "-D convertToDT=%s -D cn=%d", + depth, ocl::typeToStr(type), ocl::typeToStr(depth), + ocl::convertTypeStr(wdepth, depth, cn, buf[1]), + cn); + k.create("resizeSampler", ocl::imgproc::resize_oclsrc, compileOpts); + + if(k.empty()) + { + useSampler = false; + } + else + { + // Convert the input into an OpenCL image type, using normalized channel data types + // and aliasing the UMat. 
+ srcImage = ocl::Image2D(src, true, true); + k.args(srcImage, ocl::KernelArg::WriteOnly(dst), + (float)inv_fx, (float)inv_fy); + } + } + + if (interpolation == INTER_LINEAR && !useSampler) { char buf[2][32]; @@ -2180,25 +2203,14 @@ static bool ocl_resize( InputArray _src, OutputArray _dst, Size dsize, { int wdepth2 = std::max(CV_32F, depth), wtype2 = CV_MAKE_TYPE(wdepth2, cn); buildOption = buildOption + format(" -D convertToT=%s -D WT2V=%s -D convertToWT2V=%s -D INTER_AREA_FAST" - " -D XSCALE=%d -D YSCALE=%d -D SCALE=%ff", - ocl::convertTypeStr(wdepth2, depth, cn, cvt[0]), - ocl::typeToStr(wtype2), ocl::convertTypeStr(wdepth, wdepth2, cn, cvt[1]), - iscale_x, iscale_y, 1.0f / (iscale_x * iscale_y)); + " -D XSCALE=%d -D YSCALE=%d -D SCALE=%ff", + ocl::convertTypeStr(wdepth2, depth, cn, cvt[0]), + ocl::typeToStr(wtype2), ocl::convertTypeStr(wdepth, wdepth2, cn, cvt[1]), + iscale_x, iscale_y, 1.0f / (iscale_x * iscale_y)); k.create("resizeAREA_FAST", ocl::imgproc::resize_oclsrc, buildOption); if (k.empty()) return false; - - int smap_tab_size = dst.cols * iscale_x + dst.rows * iscale_y; - AutoBuffer dmap_tab(dst.cols + dst.rows), smap_tab(smap_tab_size); - int * dxmap_tab = dmap_tab, * dymap_tab = dxmap_tab + dst.cols; - int * sxmap_tab = smap_tab, * symap_tab = smap_tab + dst.cols * iscale_y; - - ocl_computeResizeAreaFastTabs(dxmap_tab, sxmap_tab, iscale_x, dst.cols, src.cols); - ocl_computeResizeAreaFastTabs(dymap_tab, symap_tab, iscale_y, dst.rows, src.rows); - - Mat(1, dst.cols + dst.rows, CV_32SC1, (void *)dmap_tab).copyTo(dmap); - Mat(1, smap_tab_size, CV_32SC1, (void *)smap_tab).copyTo(smap); } else { @@ -2228,7 +2240,7 @@ static bool ocl_resize( InputArray _src, OutputArray _dst, Size dsize, ocl::KernelArg srcarg = ocl::KernelArg::ReadOnly(src), dstarg = ocl::KernelArg::WriteOnly(dst); if (is_area_fast) - k.args(srcarg, dstarg, ocl::KernelArg::PtrReadOnly(dmap), ocl::KernelArg::PtrReadOnly(smap)); + k.args(srcarg, dstarg); else k.args(srcarg, dstarg, 
inv_fxf, inv_fyf, ocl::KernelArg::PtrReadOnly(tabofsOcl), ocl::KernelArg::PtrReadOnly(mapOcl), ocl::KernelArg::PtrReadOnly(alphaOcl)); diff --git a/modules/imgproc/src/moments.cpp b/modules/imgproc/src/moments.cpp index f3fb6962d..8e0749776 100644 --- a/modules/imgproc/src/moments.cpp +++ b/modules/imgproc/src/moments.cpp @@ -466,7 +466,7 @@ cv::Moments cv::moments( InputArray _src, bool binary ) if( cn > 1 ) CV_Error( CV_StsBadArg, "Invalid image type (must be single-channel)" ); -#if (IPP_VERSION_X100 >= 801) +#if IPP_VERSION_X100 >= 801 && !defined HAVE_IPP_ICV_ONLY if (!binary) { IppiSize roi = {mat.cols, mat.rows}; @@ -474,6 +474,7 @@ cv::Moments cv::moments( InputArray _src, bool binary ) // ippiMomentInitAlloc_64f, ippiMomentFree_64f are deprecated in 8.1, but there are not another way // to initialize IppiMomentState_64f. When GetStateSize and Init functions will appear we have to // change our code. + CV_SUPPRESS_DEPRECATED_START if (0 <= ippiMomentInitAlloc_64f(&moment, ippAlgHintAccurate)) { IppStatus sts = (IppStatus)(-1); @@ -518,6 +519,7 @@ cv::Moments cv::moments( InputArray _src, bool binary ) } ippiMomentFree_64f(moment); } + CV_SUPPRESS_DEPRECATED_END } #endif diff --git a/modules/imgproc/src/opencl/filterSepCol.cl b/modules/imgproc/src/opencl/filterSepCol.cl index 29514cc21..13595058f 100644 --- a/modules/imgproc/src/opencl/filterSepCol.cl +++ b/modules/imgproc/src/opencl/filterSepCol.cl @@ -3,6 +3,7 @@ // // Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. +// Copyright (C) 2014, Itseez, Inc, all rights reserved. // Third party copyrights are property of their respective owners. 
// // @Authors @@ -60,7 +61,7 @@ #endif #define DIG(a) a, -__constant float mat_kernel[] = { COEFF }; +__constant srcT1 mat_kernel[] = { COEFF }; __kernel void col_filter(__global const uchar * src, int src_step, int src_offset, int src_whole_rows, int src_whole_cols, __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols, float delta) @@ -96,9 +97,17 @@ __kernel void col_filter(__global const uchar * src, int src_step, int src_offse { temp[0] = LDS_DAT[l_y + RADIUSY - i][l_x]; temp[1] = LDS_DAT[l_y + RADIUSY + i][l_x]; +#ifndef INTEGER_ARITHMETIC sum += mad(temp[0], mat_kernel[RADIUSY - i], temp[1] * mat_kernel[RADIUSY + i]); +#else + sum += mad24(temp[0],mat_kernel[RADIUSY - i], temp[1] * mat_kernel[RADIUSY + i]); +#endif } +#ifdef INTEGER_ARITHMETIC + sum = (sum + (1 << (SHIFT_BITS-1))) >> SHIFT_BITS; +#endif + // write the result to dst if (x < dst_cols && y < dst_rows) { diff --git a/modules/imgproc/src/opencl/filterSepRow.cl b/modules/imgproc/src/opencl/filterSepRow.cl index 726de448e..472ac4c91 100644 --- a/modules/imgproc/src/opencl/filterSepRow.cl +++ b/modules/imgproc/src/opencl/filterSepRow.cl @@ -3,6 +3,7 @@ // // Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. +// Copyright (C) 2014, Itseez, Inc, all rights reserved. // Third party copyrights are property of their respective owners. 
// // @Authors @@ -138,7 +139,15 @@ #endif #define DIG(a) a, -__constant float mat_kernel[] = { COEFF }; +__constant dstT1 mat_kernel[] = { COEFF }; + +#ifndef INTEGER_ARITHMETIC +#define dstT4 float4 +#define convertDstVec convert_float4 +#else +#define dstT4 int4 +#define convertDstVec convert_int4 +#endif __kernel void row_filter_C1_D0(__global const uchar * src, int src_step_in_pixel, int src_offset_x, int src_offset_y, int src_cols, int src_rows, int src_whole_cols, int src_whole_rows, @@ -155,7 +164,7 @@ __kernel void row_filter_C1_D0(__global const uchar * src, int src_step_in_pixel int start_y = y + src_offset_y - radiusy; int start_addr = mad24(start_y, src_step_in_pixel, start_x); - float4 sum; + dstT4 sum; uchar4 temp[READ_TIMES_ROW]; __local uchar4 LDS_DAT[LSIZE1][READ_TIMES_ROW * LSIZE0 + 1]; @@ -249,19 +258,23 @@ __kernel void row_filter_C1_D0(__global const uchar * src, int src_step_in_pixel barrier(CLK_LOCAL_MEM_FENCE); // read pixels from lds and calculate the result - sum = convert_float4(vload4(0,(__local uchar *)&LDS_DAT[l_y][l_x]+RADIUSX+offset)) * mat_kernel[RADIUSX]; + sum = convertDstVec(vload4(0,(__local uchar *)&LDS_DAT[l_y][l_x]+RADIUSX+offset)) * mat_kernel[RADIUSX]; for (int i = 1; i <= RADIUSX; ++i) { temp[0] = vload4(0, (__local uchar*)&LDS_DAT[l_y][l_x] + RADIUSX + offset - i); temp[1] = vload4(0, (__local uchar*)&LDS_DAT[l_y][l_x] + RADIUSX + offset + i); - sum += mad(convert_float4(temp[0]), mat_kernel[RADIUSX-i], convert_float4(temp[1]) * mat_kernel[RADIUSX + i]); +#ifndef INTEGER_ARITHMETIC + sum += mad(convertDstVec(temp[0]), mat_kernel[RADIUSX-i], convertDstVec(temp[1]) * mat_kernel[RADIUSX + i]); +#else + sum += mad24(convertDstVec(temp[0]), mat_kernel[RADIUSX-i], convertDstVec(temp[1]) * mat_kernel[RADIUSX + i]); +#endif } start_addr = mad24(y, dst_step_in_pixel, x); // write the result to dst if ((x+3> SHIFT_BITS; +#endif // store result into destination image storepix(convertToDstT(sum + (WT)(delta)), Dst + mad24(y, 
dst_step, mad24(x, DSTSIZE, dst_offset))); diff --git a/modules/imgproc/src/opencl/resize.cl b/modules/imgproc/src/opencl/resize.cl index d656bf6d1..55ef06994 100644 --- a/modules/imgproc/src/opencl/resize.cl +++ b/modules/imgproc/src/opencl/resize.cl @@ -67,7 +67,64 @@ #define TSIZE (int)sizeof(T1)*cn #endif -#ifdef INTER_LINEAR_INTEGER +#if defined USE_SAMPLER + +#if cn == 1 +#define READ_IMAGE(X,Y,Z) read_imagef(X,Y,Z).x +#elif cn == 2 +#define READ_IMAGE(X,Y,Z) read_imagef(X,Y,Z).xy +#elif cn == 3 +#define READ_IMAGE(X,Y,Z) read_imagef(X,Y,Z).xyz +#elif cn == 4 +#define READ_IMAGE(X,Y,Z) read_imagef(X,Y,Z) +#endif + +#define __CAT(x, y) x##y +#define CAT(x, y) __CAT(x, y) +#define INTERMEDIATE_TYPE CAT(float, cn) +#define float1 float + +#if depth == 0 +#define RESULT_SCALE 255.0f +#elif depth == 1 +#define RESULT_SCALE 127.0f +#elif depth == 2 +#define RESULT_SCALE 65535.0f +#elif depth == 3 +#define RESULT_SCALE 32767.0f +#else +#define RESULT_SCALE 1.0f +#endif + +__kernel void resizeSampler(__read_only image2d_t srcImage, + __global uchar* dstptr, int dststep, int dstoffset, + int dstrows, int dstcols, + float ifx, float ify) +{ + const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | + CLK_ADDRESS_CLAMP_TO_EDGE | + CLK_FILTER_LINEAR; + + int dx = get_global_id(0); + int dy = get_global_id(1); + + float sx = ((dx+0.5f) * ifx), sy = ((dy+0.5f) * ify); + + INTERMEDIATE_TYPE intermediate = READ_IMAGE(srcImage, sampler, (float2)(sx, sy)); + +#if depth <= 4 + T uval = convertToDT(round(intermediate * RESULT_SCALE)); +#else + T uval = convertToDT(intermediate * RESULT_SCALE); +#endif + + if(dx < dstcols && dy < dstrows) + { + storepix(uval, dstptr + mad24(dy, dststep, dstoffset + dx*TSIZE)); + } +} + +#elif defined INTER_LINEAR_INTEGER __kernel void resizeLN(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols, __global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols, @@ -185,8 +242,7 @@ __kernel void 
resizeNN(__global const uchar * srcptr, int src_step, int src_offs #ifdef INTER_AREA_FAST __kernel void resizeAREA_FAST(__global const uchar * src, int src_step, int src_offset, int src_rows, int src_cols, - __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols, - __global const int * dmap_tab, __global const int * smap_tab) + __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols) { int dx = get_global_id(0); int dy = get_global_id(1); @@ -195,21 +251,21 @@ __kernel void resizeAREA_FAST(__global const uchar * src, int src_step, int src_ { int dst_index = mad24(dy, dst_step, dst_offset); - __global const int * xmap_tab = dmap_tab; - __global const int * ymap_tab = dmap_tab + dst_cols; - __global const int * sxmap_tab = smap_tab; - __global const int * symap_tab = smap_tab + XSCALE * dst_cols; - - int sx = xmap_tab[dx], sy = ymap_tab[dy]; + int sx = XSCALE * dx; + int sy = YSCALE * dy; WTV sum = (WTV)(0); #pragma unroll - for (int y = 0; y < YSCALE; ++y) + for (int py = 0; py < YSCALE; ++py) { - int src_index = mad24(symap_tab[y + sy], src_step, src_offset); + int y = min(sy + py, src_rows - 1); + int src_index = mad24(y, src_step, src_offset); #pragma unroll - for (int x = 0; x < XSCALE; ++x) - sum += convertToWTV(loadpix(src + mad24(sxmap_tab[sx + x], TSIZE, src_index))); + for (int px = 0; px < XSCALE; ++px) + { + int x = min(sx + px, src_cols - 1); + sum += convertToWTV(loadpix(src + src_index + x*TSIZE)); + } } storepix(convertToT(convertToWT2V(sum) * (WT2V)(SCALE)), dst + mad24(dx, TSIZE, dst_index)); diff --git a/modules/imgproc/src/templmatch.cpp b/modules/imgproc/src/templmatch.cpp index d3fb92ebb..75ae2f7cc 100644 --- a/modules/imgproc/src/templmatch.cpp +++ b/modules/imgproc/src/templmatch.cpp @@ -341,7 +341,7 @@ static bool ocl_matchTemplate( InputArray _img, InputArray _templ, OutputArray _ #endif -#if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7) +#if defined HAVE_IPP && IPP_VERSION_MAJOR >= 7 && 
!defined HAVE_IPP_ICV_ONLY typedef IppStatus (CV_STDCALL * ippimatchTemplate)(const void*, int, IppiSize, const void*, int, IppiSize, Ipp32f* , int , IppEnum , Ipp8u*); @@ -423,11 +423,6 @@ void crossCorr( const Mat& img, const Mat& _templ, Mat& corr, Size corrsize, int ctype, Point anchor, double delta, int borderType ) { -#if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7) - if (ipp_crossCorr(img, _templ, corr)) - return; -#endif - const double blockScale = 4.5; const int minBlockSize = 256; std::vector buf; @@ -643,12 +638,16 @@ void cv::matchTemplate( InputArray _img, InputArray _templ, OutputArray _result, return; #endif -#if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7) +#if defined HAVE_IPP && IPP_VERSION_MAJOR >= 7 && !defined HAVE_IPP_ICV_ONLY if (method == CV_TM_SQDIFF && ipp_sqrDistance(img, templ, result)) return; #endif int cn = img.channels(); + +#if defined HAVE_IPP && IPP_VERSION_MAJOR >= 7 && !defined HAVE_IPP_ICV_ONLY + if (!ipp_crossCorr(img, templ, result)) +#endif crossCorr( img, templ, result, result.size(), result.type(), Point(0,0), 0, 0); if( method == CV_TM_CCORR ) diff --git a/modules/imgproc/test/ocl/test_color.cpp b/modules/imgproc/test/ocl/test_color.cpp index f0cf560bb..62962002f 100644 --- a/modules/imgproc/test/ocl/test_color.cpp +++ b/modules/imgproc/test/ocl/test_color.cpp @@ -158,7 +158,7 @@ OCL_TEST_P(CvtColor, YCrCb2BGRA) { performTest(3, 4, CVTCODE(YCrCb2BGR)); } #if IPP_VERSION_X100 > 0 #define IPP_EPS depth <= CV_32S ? 1 : 4e-5 #else -#define IPP_EPS 0 +#define IPP_EPS 1e-3 #endif OCL_TEST_P(CvtColor, RGB2XYZ) { performTest(3, 3, CVTCODE(RGB2XYZ), IPP_EPS); } @@ -261,19 +261,27 @@ OCL_TEST_P(CvtColor8u, GRAY2BGR555) { performTest(1, 2, CVTCODE(GRAY2BGR555)); } // RGBA <-> mRGBA -OCL_TEST_P(CvtColor8u, RGBA2mRGBA) { performTest(4, 4, CVTCODE(RGBA2mRGBA)); } +#if IPP_VERSION_X100 > 0 +#define IPP_EPS depth <= CV_32S ? 
1 : 1e-3 +#else +#define IPP_EPS 1e-3 +#endif + +OCL_TEST_P(CvtColor8u, RGBA2mRGBA) { performTest(4, 4, CVTCODE(RGBA2mRGBA), IPP_EPS); } OCL_TEST_P(CvtColor8u, mRGBA2RGBA) { performTest(4, 4, CVTCODE(mRGBA2RGBA)); } // RGB <-> Lab OCL_TEST_P(CvtColor8u32f, BGR2Lab) { performTest(3, 3, CVTCODE(BGR2Lab)); } OCL_TEST_P(CvtColor8u32f, RGB2Lab) { performTest(3, 3, CVTCODE(RGB2Lab)); } -OCL_TEST_P(CvtColor8u32f, LBGR2Lab) { performTest(3, 3, CVTCODE(LBGR2Lab)); } -OCL_TEST_P(CvtColor8u32f, LRGB2Lab) { performTest(3, 3, CVTCODE(LRGB2Lab)); } +OCL_TEST_P(CvtColor8u32f, LBGR2Lab) { performTest(3, 3, CVTCODE(LBGR2Lab), IPP_EPS); } +OCL_TEST_P(CvtColor8u32f, LRGB2Lab) { performTest(3, 3, CVTCODE(LRGB2Lab), IPP_EPS); } OCL_TEST_P(CvtColor8u32f, BGRA2Lab) { performTest(4, 3, CVTCODE(BGR2Lab)); } OCL_TEST_P(CvtColor8u32f, RGBA2Lab) { performTest(4, 3, CVTCODE(RGB2Lab)); } -OCL_TEST_P(CvtColor8u32f, LBGRA2Lab) { performTest(4, 3, CVTCODE(LBGR2Lab)); } -OCL_TEST_P(CvtColor8u32f, LRGBA2Lab) { performTest(4, 3, CVTCODE(LRGB2Lab)); } +OCL_TEST_P(CvtColor8u32f, LBGRA2Lab) { performTest(4, 3, CVTCODE(LBGR2Lab), IPP_EPS); } +OCL_TEST_P(CvtColor8u32f, LRGBA2Lab) { performTest(4, 3, CVTCODE(LRGB2Lab), IPP_EPS); } + +#undef IPP_EPS OCL_TEST_P(CvtColor8u32f, Lab2BGR) { performTest(3, 3, CVTCODE(Lab2BGR), depth == CV_8U ? 1 : 1e-5); } OCL_TEST_P(CvtColor8u32f, Lab2RGB) { performTest(3, 3, CVTCODE(Lab2RGB), depth == CV_8U ? 1 : 1e-5); } diff --git a/modules/imgproc/test/ocl/test_filters.cpp b/modules/imgproc/test/ocl/test_filters.cpp index d2f508516..e46f5b054 100644 --- a/modules/imgproc/test/ocl/test_filters.cpp +++ b/modules/imgproc/test/ocl/test_filters.cpp @@ -219,7 +219,7 @@ OCL_TEST_P(GaussianBlurTest, Mat) OCL_OFF(cv::GaussianBlur(src_roi, dst_roi, Size(ksize, ksize), sigma1, sigma2, borderType)); OCL_ON(cv::GaussianBlur(usrc_roi, udst_roi, Size(ksize, ksize), sigma1, sigma2, borderType)); - Near(CV_MAT_DEPTH(type) == CV_8U ? 3 : 5e-5, false); + Near(CV_MAT_DEPTH(type) >= CV_32F ? 
5e-5 : 1, false); } } diff --git a/modules/imgproc/test/ocl/test_histogram.cpp b/modules/imgproc/test/ocl/test_histogram.cpp index c13556b30..68a2a60fd 100644 --- a/modules/imgproc/test/ocl/test_histogram.cpp +++ b/modules/imgproc/test/ocl/test_histogram.cpp @@ -146,7 +146,7 @@ PARAM_TEST_CASE(CalcBackProject, MatDepth, int, bool) scale = randomDouble(0.1, 1); } - virtual void test_by_pict() + void test_by_pict() { Mat frame1 = readImage("optflow/RubberWhale1.png", IMREAD_GRAYSCALE); @@ -174,7 +174,19 @@ PARAM_TEST_CASE(CalcBackProject, MatDepth, int, bool) OCL_OFF(calcBackProject(&frame1, 1, 0, hist1, dst1, &ranges1, 1, true)); OCL_ON(calcBackProject(uims, chs, uhist1, udst1, urngs, 1.0)); - EXPECT_MAT_NEAR(dst1, udst1, 0.0); + + if (cv::ocl::useOpenCL() && cv::ocl::Device::getDefault().isAMD()) + { + Size dstSize = dst1.size(); + int nDiffs = (int)(0.03f*dstSize.height*dstSize.width); + + //check if the dst mats are the same except 3% difference + EXPECT_MAT_N_DIFF(dst1, udst1, nDiffs); + } + else + { + EXPECT_MAT_NEAR(dst1, udst1, 0.0); + } } }; @@ -194,12 +206,15 @@ OCL_TEST_P(CalcBackProject, Mat) //check if the dst mats are the same except 3% difference EXPECT_MAT_N_DIFF(dst_roi, udst_roi, nDiffs); - - //check in addition on given image - test_by_pict(); } } +OCL_TEST_P(CalcBackProject, Mat_RealImage) +{ + //check on given image + test_by_pict(); +} + //////////////////////////////// CalcHist ////////////////////////////////////////////// PARAM_TEST_CASE(CalcHist, bool) diff --git a/modules/imgproc/test/ocl/test_warp.cpp b/modules/imgproc/test/ocl/test_warp.cpp index f9ccef8c6..d59cf75ad 100644 --- a/modules/imgproc/test/ocl/test_warp.cpp +++ b/modules/imgproc/test/ocl/test_warp.cpp @@ -158,9 +158,10 @@ OCL_TEST_P(WarpPerspective, Mat) ///////////////////////////////////////////////////////////////////////////////////////////////// //// resize -PARAM_TEST_CASE(Resize, MatType, double, double, Interpolation, bool) +PARAM_TEST_CASE(Resize, MatType, double, 
double, Interpolation, bool, int) { int type, interpolation; + int widthMultiple; double fx, fy; bool useRoi; @@ -174,6 +175,7 @@ PARAM_TEST_CASE(Resize, MatType, double, double, Interpolation, bool) fy = GET_PARAM(2); interpolation = GET_PARAM(3); useRoi = GET_PARAM(4); + widthMultiple = GET_PARAM(5); } void random_roi() @@ -181,6 +183,9 @@ PARAM_TEST_CASE(Resize, MatType, double, double, Interpolation, bool) CV_Assert(fx > 0 && fy > 0); Size srcRoiSize = randomSize(1, MAX_VALUE), dstRoiSize; + // Make sure the width is a multiple of the requested value, and no more + srcRoiSize.width &= ~((widthMultiple * 2) - 1); + srcRoiSize.width += widthMultiple; dstRoiSize.width = cvRound(srcRoiSize.width * fx); dstRoiSize.height = cvRound(srcRoiSize.height * fy); @@ -334,14 +339,16 @@ OCL_INSTANTIATE_TEST_CASE_P(ImgprocWarp, Resize, Combine( Values(0.5, 1.5, 2.0, 0.2), Values(0.5, 1.5, 2.0, 0.2), Values((Interpolation)INTER_NEAREST, (Interpolation)INTER_LINEAR), - Bool())); + Bool(), + Values(1, 16))); OCL_INSTANTIATE_TEST_CASE_P(ImgprocWarpResizeArea, Resize, Combine( Values((MatType)CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4), Values(0.7, 0.4, 0.5), Values(0.3, 0.6, 0.5), Values((Interpolation)INTER_AREA), - Bool())); + Bool(), + Values(1, 16))); OCL_INSTANTIATE_TEST_CASE_P(ImgprocWarp, Remap_INTER_LINEAR, Combine( Values(CV_8U, CV_16U, CV_32F), diff --git a/modules/objdetect/src/hog.cpp b/modules/objdetect/src/hog.cpp index 0f4456ad5..a84c00d05 100644 --- a/modules/objdetect/src/hog.cpp +++ b/modules/objdetect/src/hog.cpp @@ -3525,7 +3525,6 @@ void HOGDescriptor::groupRectangles(std::vector& rectList, std::vector std::vector > rrects(nclasses); std::vector numInClass(nclasses, 0); std::vector foundWeights(nclasses, DBL_MIN); - std::vector totalFactorsPerClass(nclasses, 1); int i, j, nlabels = (int)labels.size(); for( i = 0; i < nlabels; i++ ) diff --git a/modules/python/src2/cv2.cpp b/modules/python/src2/cv2.cpp index dfb1d6898..e8b299946 100644 --- 
a/modules/python/src2/cv2.cpp +++ b/modules/python/src2/cv2.cpp @@ -134,6 +134,7 @@ typedef Ptr Ptr_DescriptorMatcher; typedef Ptr Ptr_BackgroundSubtractor; typedef Ptr Ptr_BackgroundSubtractorMOG; typedef Ptr Ptr_BackgroundSubtractorMOG2; +typedef Ptr Ptr_BackgroundSubtractorKNN; typedef Ptr Ptr_BackgroundSubtractorGMG; typedef Ptr Ptr_StereoMatcher; diff --git a/modules/stitching/doc/blenders.rst b/modules/stitching/doc/blenders.rst index 031a32664..39d9571c3 100644 --- a/modules/stitching/doc/blenders.rst +++ b/modules/stitching/doc/blenders.rst @@ -43,7 +43,7 @@ detail::Blender::feed Processes the image. -.. ocv:function:: void detail::Blender::feed(const Mat &img, const Mat &mask, Point tl) +.. ocv:function:: void detail::Blender::feed(InputArray img, InputArray mask, Point tl) :param img: Source image @@ -56,7 +56,7 @@ detail::Blender::blend Blends and returns the final pano. -.. ocv:function:: void detail::Blender::blend(Mat &dst, Mat &dst_mask) +.. ocv:function:: void detail::Blender::blend(InputOutputArray dst, InputOutputArray dst_mask) :param dst: Final pano diff --git a/modules/stitching/doc/exposure_compensation.rst b/modules/stitching/doc/exposure_compensation.rst index ec0d5db32..b2778f124 100644 --- a/modules/stitching/doc/exposure_compensation.rst +++ b/modules/stitching/doc/exposure_compensation.rst @@ -27,9 +27,9 @@ Base class for all exposure compensators. :: detail::ExposureCompensator::feed ---------------------------------- -.. ocv:function:: void detail::ExposureCompensator::feed(const std::vector &corners, const std::vector &images, const std::vector &masks) +.. ocv:function:: void detail::ExposureCompensator::feed(const std::vector &corners, const std::vector &images, const std::vector &masks) -.. ocv:function:: void detail::ExposureCompensator::feed(const std::vector &corners, const std::vector &images, const std::vector > &masks) +.. 
ocv:function:: void detail::ExposureCompensator::feed(const std::vector &corners, const std::vector &images, const std::vector > &masks) :param corners: Source image top-left corners @@ -42,7 +42,7 @@ detil::ExposureCompensator::apply Compensate exposure in the specified image. -.. ocv:function:: void detail::ExposureCompensator::apply(int index, Point corner, Mat &image, const Mat &mask) +.. ocv:function:: void detail::ExposureCompensator::apply(int index, Point corner, InputOutputArray image, InputArray mask) :param index: Image index diff --git a/modules/stitching/doc/high_level.rst b/modules/stitching/doc/high_level.rst index 955f8f248..4cc5f9914 100644 --- a/modules/stitching/doc/high_level.rst +++ b/modules/stitching/doc/high_level.rst @@ -110,9 +110,9 @@ These functions try to match the given images and to estimate rotations of each .. note:: Use the functions only if you're aware of the stitching pipeline, otherwise use :ocv:func:`Stitcher::stitch`. -.. ocv:function:: Status Stitcher::estimateTransform(InputArray images) +.. ocv:function:: Status Stitcher::estimateTransform(InputArrayOfArrays images) -.. ocv:function:: Status Stitcher::estimateTransform(InputArray images, const std::vector > &rois) +.. ocv:function:: Status Stitcher::estimateTransform(InputArrayOfArrays images, const std::vector > &rois) :param images: Input images. @@ -129,7 +129,7 @@ These functions try to compose the given images (or images stored internally fro .. ocv:function:: Status Stitcher::composePanorama(OutputArray pano) -.. ocv:function:: Status Stitcher::composePanorama(InputArray images, OutputArray pano) +.. ocv:function:: Status Stitcher::composePanorama(InputArrayOfArrays images, OutputArray pano) :param images: Input images. @@ -142,9 +142,9 @@ Stitcher::stitch These functions try to stitch the given images. -.. ocv:function:: Status Stitcher::stitch(InputArray images, OutputArray pano) +.. 
ocv:function:: Status Stitcher::stitch(InputArrayOfArrays images, OutputArray pano) -.. ocv:function:: Status Stitcher::stitch(InputArray images, const std::vector > &rois, OutputArray pano) +.. ocv:function:: Status Stitcher::stitch(InputArrayOfArrays images, const std::vector > &rois, OutputArray pano) :param images: Input images. diff --git a/modules/stitching/doc/matching.rst b/modules/stitching/doc/matching.rst index 2f9dabf72..9f112d0a0 100644 --- a/modules/stitching/doc/matching.rst +++ b/modules/stitching/doc/matching.rst @@ -40,9 +40,9 @@ detail::FeaturesFinder::operator() Finds features in the given image. -.. ocv:function:: void detail::FeaturesFinder::operator ()(const Mat &image, ImageFeatures &features) +.. ocv:function:: void detail::FeaturesFinder::operator ()(InputArray image, ImageFeatures &features) -.. ocv:function:: void detail::FeaturesFinder::operator ()(const Mat &image, ImageFeatures &features, const std::vector &rois) +.. ocv:function:: void detail::FeaturesFinder::operator ()(InputArray image, ImageFeatures &features, const std::vector &rois) :param image: Source image @@ -64,7 +64,7 @@ detail::FeaturesFinder::find This method must implement features finding logic in order to make the wrappers `detail::FeaturesFinder::operator()`_ work. -.. ocv:function:: void detail::FeaturesFinder::find(const Mat &image, ImageFeatures &features) +.. ocv:function:: void detail::FeaturesFinder::find(InputArray image, ImageFeatures &features) :param image: Source image @@ -171,7 +171,7 @@ Performs images matching. :param matches_info: Found matches -.. ocv:function:: void detail::FeaturesMatcher::operator ()( const std::vector & features, std::vector & pairwise_matches, const Mat & mask=Mat() ) +.. 
ocv:function:: void detail::FeaturesMatcher::operator ()( const std::vector & features, std::vector & pairwise_matches, const UMat & mask=UMat() ) :param features: Features of the source images diff --git a/modules/stitching/doc/seam_estimation.rst b/modules/stitching/doc/seam_estimation.rst index 119fb1e15..e7a3e47b2 100644 --- a/modules/stitching/doc/seam_estimation.rst +++ b/modules/stitching/doc/seam_estimation.rst @@ -22,7 +22,7 @@ detail::SeamFinder::find Estimates seams. -.. ocv:function:: void detail::SeamFinder::find(const std::vector &src, const std::vector &corners, std::vector &masks) +.. ocv:function:: void detail::SeamFinder::find(const std::vector &src, const std::vector &corners, std::vector &masks) :param src: Source images diff --git a/modules/stitching/include/opencv2/stitching.hpp b/modules/stitching/include/opencv2/stitching.hpp index 2c48f2f2a..b647c0fe2 100644 --- a/modules/stitching/include/opencv2/stitching.hpp +++ b/modules/stitching/include/opencv2/stitching.hpp @@ -98,8 +98,8 @@ public: void setFeaturesMatcher(Ptr features_matcher) { features_matcher_ = features_matcher; } - const cv::Mat& matchingMask() const { return matching_mask_; } - void setMatchingMask(const cv::Mat &mask) + const cv::UMat& matchingMask() const { return matching_mask_; } + void setMatchingMask(const cv::UMat &mask) { CV_Assert(mask.type() == CV_8U && mask.cols == mask.rows); matching_mask_ = mask.clone(); @@ -127,14 +127,14 @@ public: const Ptr blender() const { return blender_; } void setBlender(Ptr b) { blender_ = b; } - Status estimateTransform(InputArray images); - Status estimateTransform(InputArray images, const std::vector > &rois); + Status estimateTransform(InputArrayOfArrays images); + Status estimateTransform(InputArrayOfArrays images, const std::vector > &rois); Status composePanorama(OutputArray pano); - Status composePanorama(InputArray images, OutputArray pano); + Status composePanorama(InputArrayOfArrays images, OutputArray pano); - Status 
stitch(InputArray images, OutputArray pano); - Status stitch(InputArray images, const std::vector > &rois, OutputArray pano); + Status stitch(InputArrayOfArrays images, OutputArray pano); + Status stitch(InputArrayOfArrays images, const std::vector > &rois, OutputArray pano); std::vector component() const { return indices_; } std::vector cameras() const { return cameras_; } @@ -152,7 +152,7 @@ private: double conf_thresh_; Ptr features_finder_; Ptr features_matcher_; - cv::Mat matching_mask_; + cv::UMat matching_mask_; Ptr bundle_adjuster_; bool do_wave_correct_; detail::WaveCorrectKind wave_correct_kind_; @@ -161,12 +161,12 @@ private: Ptr seam_finder_; Ptr blender_; - std::vector imgs_; + std::vector imgs_; std::vector > rois_; std::vector full_img_sizes_; std::vector features_; std::vector pairwise_matches_; - std::vector seam_est_imgs_; + std::vector seam_est_imgs_; std::vector indices_; std::vector cameras_; double work_scale_; diff --git a/modules/stitching/include/opencv2/stitching/detail/blenders.hpp b/modules/stitching/include/opencv2/stitching/detail/blenders.hpp index 8d47d3225..f91a0eae6 100644 --- a/modules/stitching/include/opencv2/stitching/detail/blenders.hpp +++ b/modules/stitching/include/opencv2/stitching/detail/blenders.hpp @@ -60,11 +60,11 @@ public: void prepare(const std::vector &corners, const std::vector &sizes); virtual void prepare(Rect dst_roi); - virtual void feed(const Mat &img, const Mat &mask, Point tl); - virtual void blend(Mat &dst, Mat &dst_mask); + virtual void feed(InputArray img, InputArray mask, Point tl); + virtual void blend(InputOutputArray dst, InputOutputArray dst_mask); protected: - Mat dst_, dst_mask_; + UMat dst_, dst_mask_; Rect dst_roi_; }; @@ -78,18 +78,18 @@ public: void setSharpness(float val) { sharpness_ = val; } void prepare(Rect dst_roi); - void feed(const Mat &img, const Mat &mask, Point tl); - void blend(Mat &dst, Mat &dst_mask); + void feed(InputArray img, InputArray mask, Point tl); + void 
blend(InputOutputArray dst, InputOutputArray dst_mask); // Creates weight maps for fixed set of source images by their masks and top-left corners. // Final image can be obtained by simple weighting of the source images. - Rect createWeightMaps(const std::vector &masks, const std::vector &corners, - std::vector &weight_maps); + Rect createWeightMaps(const std::vector &masks, const std::vector &corners, + std::vector &weight_maps); private: float sharpness_; - Mat weight_map_; - Mat dst_weight_map_; + UMat weight_map_; + UMat dst_weight_map_; }; inline FeatherBlender::FeatherBlender(float _sharpness) { setSharpness(_sharpness); } @@ -104,13 +104,13 @@ public: void setNumBands(int val) { actual_num_bands_ = val; } void prepare(Rect dst_roi); - void feed(const Mat &img, const Mat &mask, Point tl); - void blend(Mat &dst, Mat &dst_mask); + void feed(InputArray img, InputArray mask, Point tl); + void blend(InputOutputArray dst, InputOutputArray dst_mask); private: int actual_num_bands_, num_bands_; - std::vector dst_pyr_laplace_; - std::vector dst_band_weights_; + std::vector dst_pyr_laplace_; + std::vector dst_band_weights_; Rect dst_roi_final_; bool can_use_gpu_; int weight_type_; //CV_32F or CV_16S @@ -120,16 +120,16 @@ private: ////////////////////////////////////////////////////////////////////////////// // Auxiliary functions -void CV_EXPORTS normalizeUsingWeightMap(const Mat& weight, Mat& src); +void CV_EXPORTS normalizeUsingWeightMap(InputArray weight, InputOutputArray src); -void CV_EXPORTS createWeightMap(const Mat& mask, float sharpness, Mat& weight); +void CV_EXPORTS createWeightMap(InputArray mask, float sharpness, InputOutputArray weight); -void CV_EXPORTS createLaplacePyr(const Mat &img, int num_levels, std::vector& pyr); -void CV_EXPORTS createLaplacePyrGpu(const Mat &img, int num_levels, std::vector& pyr); +void CV_EXPORTS createLaplacePyr(InputArray img, int num_levels, std::vector& pyr); +void CV_EXPORTS createLaplacePyrGpu(InputArray img, int 
num_levels, std::vector& pyr); // Restores source image -void CV_EXPORTS restoreImageFromLaplacePyr(std::vector& pyr); -void CV_EXPORTS restoreImageFromLaplacePyrGpu(std::vector& pyr); +void CV_EXPORTS restoreImageFromLaplacePyr(std::vector& pyr); +void CV_EXPORTS restoreImageFromLaplacePyrGpu(std::vector& pyr); } // namespace detail } // namespace cv diff --git a/modules/stitching/include/opencv2/stitching/detail/exposure_compensate.hpp b/modules/stitching/include/opencv2/stitching/detail/exposure_compensate.hpp index 84a8ce4fe..9cd8b32f4 100644 --- a/modules/stitching/include/opencv2/stitching/detail/exposure_compensate.hpp +++ b/modules/stitching/include/opencv2/stitching/detail/exposure_compensate.hpp @@ -56,29 +56,29 @@ public: enum { NO, GAIN, GAIN_BLOCKS }; static Ptr createDefault(int type); - void feed(const std::vector &corners, const std::vector &images, - const std::vector &masks); - virtual void feed(const std::vector &corners, const std::vector &images, - const std::vector > &masks) = 0; - virtual void apply(int index, Point corner, Mat &image, const Mat &mask) = 0; + void feed(const std::vector &corners, const std::vector &images, + const std::vector &masks); + virtual void feed(const std::vector &corners, const std::vector &images, + const std::vector > &masks) = 0; + virtual void apply(int index, Point corner, InputOutputArray image, InputArray mask) = 0; }; class CV_EXPORTS NoExposureCompensator : public ExposureCompensator { public: - void feed(const std::vector &/*corners*/, const std::vector &/*images*/, - const std::vector > &/*masks*/) { } - void apply(int /*index*/, Point /*corner*/, Mat &/*image*/, const Mat &/*mask*/) { } + void feed(const std::vector &/*corners*/, const std::vector &/*images*/, + const std::vector > &/*masks*/) { } + void apply(int /*index*/, Point /*corner*/, InputOutputArray /*image*/, InputArray /*mask*/) { } }; class CV_EXPORTS GainCompensator : public ExposureCompensator { public: - void feed(const std::vector 
&corners, const std::vector &images, - const std::vector > &masks); - void apply(int index, Point corner, Mat &image, const Mat &mask); + void feed(const std::vector &corners, const std::vector &images, + const std::vector > &masks); + void apply(int index, Point corner, InputOutputArray image, InputArray mask); std::vector gains() const; private: @@ -91,13 +91,13 @@ class CV_EXPORTS BlocksGainCompensator : public ExposureCompensator public: BlocksGainCompensator(int bl_width = 32, int bl_height = 32) : bl_width_(bl_width), bl_height_(bl_height) {} - void feed(const std::vector &corners, const std::vector &images, - const std::vector > &masks); - void apply(int index, Point corner, Mat &image, const Mat &mask); + void feed(const std::vector &corners, const std::vector &images, + const std::vector > &masks); + void apply(int index, Point corner, InputOutputArray image, InputArray mask); private: int bl_width_, bl_height_; - std::vector > gain_maps_; + std::vector gain_maps_; }; } // namespace detail diff --git a/modules/stitching/include/opencv2/stitching/detail/matchers.hpp b/modules/stitching/include/opencv2/stitching/detail/matchers.hpp index 02f86608c..8c3284a46 100644 --- a/modules/stitching/include/opencv2/stitching/detail/matchers.hpp +++ b/modules/stitching/include/opencv2/stitching/detail/matchers.hpp @@ -60,7 +60,7 @@ struct CV_EXPORTS ImageFeatures int img_idx; Size img_size; std::vector keypoints; - Mat descriptors; + UMat descriptors; }; @@ -68,12 +68,12 @@ class CV_EXPORTS FeaturesFinder { public: virtual ~FeaturesFinder() {} - void operator ()(const Mat &image, ImageFeatures &features); - void operator ()(const Mat &image, ImageFeatures &features, const std::vector &rois); + void operator ()(InputArray image, ImageFeatures &features); + void operator ()(InputArray image, ImageFeatures &features, const std::vector &rois); virtual void collectGarbage() {} protected: - virtual void find(const Mat &image, ImageFeatures &features) = 0; + virtual void 
find(InputArray image, ImageFeatures &features) = 0; }; @@ -84,7 +84,7 @@ public: int num_octaves_descr = /*4*/3, int num_layers_descr = /*2*/4); private: - void find(const Mat &image, ImageFeatures &features); + void find(InputArray image, ImageFeatures &features); Ptr detector_; Ptr extractor_; @@ -97,7 +97,7 @@ public: OrbFeaturesFinder(Size _grid_size = Size(3,1), int nfeatures=1500, float scaleFactor=1.3f, int nlevels=5); private: - void find(const Mat &image, ImageFeatures &features); + void find(InputArray image, ImageFeatures &features); Ptr orb; Size grid_size; @@ -114,7 +114,7 @@ public: void collectGarbage(); private: - void find(const Mat &image, ImageFeatures &features); + void find(InputArray image, ImageFeatures &features); cuda::GpuMat image_; cuda::GpuMat gray_image_; @@ -151,7 +151,7 @@ public: MatchesInfo& matches_info) { match(features1, features2, matches_info); } void operator ()(const std::vector &features, std::vector &pairwise_matches, - const cv::Mat &mask = cv::Mat()); + const cv::UMat &mask = cv::UMat()); bool isThreadSafe() const { return is_thread_safe_; } diff --git a/modules/stitching/include/opencv2/stitching/detail/seam_finders.hpp b/modules/stitching/include/opencv2/stitching/detail/seam_finders.hpp index 24b7db6b5..5f085c1a4 100644 --- a/modules/stitching/include/opencv2/stitching/detail/seam_finders.hpp +++ b/modules/stitching/include/opencv2/stitching/detail/seam_finders.hpp @@ -54,32 +54,32 @@ class CV_EXPORTS SeamFinder { public: virtual ~SeamFinder() {} - virtual void find(const std::vector &src, const std::vector &corners, - std::vector &masks) = 0; + virtual void find(const std::vector &src, const std::vector &corners, + std::vector &masks) = 0; }; class CV_EXPORTS NoSeamFinder : public SeamFinder { public: - void find(const std::vector&, const std::vector&, std::vector&) {} + void find(const std::vector&, const std::vector&, std::vector&) {} }; class CV_EXPORTS PairwiseSeamFinder : public SeamFinder { public: - virtual 
void find(const std::vector &src, const std::vector &corners, - std::vector &masks); + virtual void find(const std::vector &src, const std::vector &corners, + std::vector &masks); protected: void run(); virtual void findInPair(size_t first, size_t second, Rect roi) = 0; - std::vector images_; + std::vector images_; std::vector sizes_; std::vector corners_; - std::vector masks_; + std::vector masks_; }; @@ -87,7 +87,7 @@ class CV_EXPORTS VoronoiSeamFinder : public PairwiseSeamFinder { public: virtual void find(const std::vector &size, const std::vector &corners, - std::vector &masks); + std::vector &masks); private: void findInPair(size_t first, size_t second, Rect roi); }; @@ -103,8 +103,8 @@ public: CostFunction costFunction() const { return costFunc_; } void setCostFunction(CostFunction val) { costFunc_ = val; } - virtual void find(const std::vector &src, const std::vector &corners, - std::vector &masks); + virtual void find(const std::vector &src, const std::vector &corners, + std::vector &masks); private: enum ComponentState @@ -154,7 +154,7 @@ private: }; void process( - const Mat &image1, const Mat &image2, Point tl1, Point tl2, Mat &mask1, Mat &mask2); + const Mat &image1, const Mat &image2, Point tl1, Point tl2, Mat &mask1, Mat &mask2); void findComponents(); @@ -217,8 +217,8 @@ public: ~GraphCutSeamFinder(); - void find(const std::vector &src, const std::vector &corners, - std::vector &masks); + void find(const std::vector &src, const std::vector &corners, + std::vector &masks); private: // To avoid GCGraph dependency @@ -236,8 +236,8 @@ public: : cost_type_(cost_type), terminal_cost_(terminal_cost), bad_region_penalty_(bad_region_penalty) {} - void find(const std::vector &src, const std::vector &corners, - std::vector &masks); + void find(const std::vector &src, const std::vector &corners, + std::vector &masks); void findInPair(size_t first, size_t second, Rect roi); private: diff --git a/modules/stitching/include/opencv2/stitching/detail/util.hpp 
b/modules/stitching/include/opencv2/stitching/detail/util.hpp index 6f334b9ad..6b1c5f34f 100644 --- a/modules/stitching/include/opencv2/stitching/detail/util.hpp +++ b/modules/stitching/include/opencv2/stitching/detail/util.hpp @@ -71,6 +71,7 @@ #define LOG_(_level, _msg) \ for(;;) \ { \ + using namespace std; \ if ((_level) >= ::cv::detail::stitchingLogLevel()) \ { \ LOG_STITCHING_MSG(_msg); \ @@ -145,7 +146,7 @@ private: // Auxiliary functions CV_EXPORTS bool overlapRoi(Point tl1, Point tl2, Size sz1, Size sz2, Rect &roi); -CV_EXPORTS Rect resultRoi(const std::vector &corners, const std::vector &images); +CV_EXPORTS Rect resultRoi(const std::vector &corners, const std::vector &images); CV_EXPORTS Rect resultRoi(const std::vector &corners, const std::vector &sizes); CV_EXPORTS Point resultTl(const std::vector &corners); diff --git a/modules/stitching/include/opencv2/stitching/detail/warpers.hpp b/modules/stitching/include/opencv2/stitching/detail/warpers.hpp index 093f07cc1..c8869f116 100644 --- a/modules/stitching/include/opencv2/stitching/detail/warpers.hpp +++ b/modules/stitching/include/opencv2/stitching/detail/warpers.hpp @@ -160,6 +160,8 @@ class CV_EXPORTS SphericalWarper : public RotationWarperBase public: SphericalWarper(float scale) { projector_.scale = scale; } + Rect buildMaps(Size src_size, InputArray K, InputArray R, OutputArray xmap, OutputArray ymap); + Point warp(InputArray src, InputArray K, InputArray R, int interp_mode, int border_mode, OutputArray dst); protected: void detectResultRoi(Size src_size, Point &dst_tl, Point &dst_br); }; @@ -178,6 +180,8 @@ class CV_EXPORTS CylindricalWarper : public RotationWarperBase Point RotationWarperBase

::warp(InputArray src, InputArray K, InputArray R, int interp_mode, int border_mode, OutputArray dst) { - Mat xmap, ymap; + UMat xmap, ymap; Rect dst_roi = buildMaps(src.size(), K, R, xmap, ymap); dst.create(dst_roi.height + 1, dst_roi.width + 1, src.type()); diff --git a/modules/stitching/include/opencv2/stitching/warpers.hpp b/modules/stitching/include/opencv2/stitching/warpers.hpp index cdcb35c20..da5fe2618 100644 --- a/modules/stitching/include/opencv2/stitching/warpers.hpp +++ b/modules/stitching/include/opencv2/stitching/warpers.hpp @@ -167,24 +167,6 @@ public: }; #endif -class PlaneWarperOcl: public WarperCreator -{ -public: - Ptr create(float scale) const { return makePtr(scale); } -}; - -class SphericalWarperOcl: public WarperCreator -{ -public: - Ptr create(float scale) const { return makePtr(scale); } -}; - -class CylindricalWarperOcl: public WarperCreator -{ -public: - Ptr create(float scale) const { return makePtr(scale); } -}; - } // namespace cv #endif // __OPENCV_STITCHING_WARPER_CREATORS_HPP__ diff --git a/modules/stitching/perf/opencl/perf_stitch.cpp b/modules/stitching/perf/opencl/perf_stitch.cpp new file mode 100644 index 000000000..343472699 --- /dev/null +++ b/modules/stitching/perf/opencl/perf_stitch.cpp @@ -0,0 +1,144 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. +// +// Copyright (C) 2014, Itseez, Inc, all rights reserved. 
+ +#include "perf_precomp.hpp" +#include "opencv2/ts/ocl_perf.hpp" + +using namespace cv; +using namespace perf; +using namespace cvtest::ocl; +using namespace std; +using namespace std::tr1; + +#define SURF_MATCH_CONFIDENCE 0.65f +#define ORB_MATCH_CONFIDENCE 0.3f +#define WORK_MEGAPIX 0.6 + +typedef TestBaseWithParam stitch; + +#ifdef HAVE_OPENCV_NONFREE_TODO_FIND_WHY_SURF_IS_NOT_ABLE_TO_STITCH_PANOS +#define TEST_DETECTORS testing::Values("surf", "orb") +#else +#define TEST_DETECTORS testing::Values("orb") +#endif + +OCL_PERF_TEST_P(stitch, a123, TEST_DETECTORS) +{ + UMat pano; + + vector _imgs; + _imgs.push_back( imread( getDataPath("stitching/a1.png") ) ); + _imgs.push_back( imread( getDataPath("stitching/a2.png") ) ); + _imgs.push_back( imread( getDataPath("stitching/a3.png") ) ); + vector imgs = ToUMat(_imgs); + + Ptr featuresFinder = GetParam() == "orb" + ? Ptr(new detail::OrbFeaturesFinder()) + : Ptr(new detail::SurfFeaturesFinder()); + + Ptr featuresMatcher = GetParam() == "orb" + ? makePtr(false, ORB_MATCH_CONFIDENCE) + : makePtr(false, SURF_MATCH_CONFIDENCE); + + declare.iterations(20); + + while(next()) + { + Stitcher stitcher = Stitcher::createDefault(); + stitcher.setFeaturesFinder(featuresFinder); + stitcher.setFeaturesMatcher(featuresMatcher); + stitcher.setWarper(makePtr()); + stitcher.setRegistrationResol(WORK_MEGAPIX); + + startTimer(); + stitcher.stitch(imgs, pano); + stopTimer(); + } + + EXPECT_NEAR(pano.size().width, 1182, 50); + EXPECT_NEAR(pano.size().height, 682, 30); + + SANITY_CHECK_NOTHING(); +} + +OCL_PERF_TEST_P(stitch, b12, TEST_DETECTORS) +{ + UMat pano; + + vector imgs; + imgs.push_back( imread( getDataPath("stitching/b1.png") ) ); + imgs.push_back( imread( getDataPath("stitching/b2.png") ) ); + + Ptr featuresFinder = GetParam() == "orb" + ? Ptr(new detail::OrbFeaturesFinder()) + : Ptr(new detail::SurfFeaturesFinder()); + + Ptr featuresMatcher = GetParam() == "orb" + ? 
makePtr(false, ORB_MATCH_CONFIDENCE) + : makePtr(false, SURF_MATCH_CONFIDENCE); + + declare.iterations(20); + + while(next()) + { + Stitcher stitcher = Stitcher::createDefault(); + stitcher.setFeaturesFinder(featuresFinder); + stitcher.setFeaturesMatcher(featuresMatcher); + stitcher.setWarper(makePtr()); + stitcher.setRegistrationResol(WORK_MEGAPIX); + + startTimer(); + stitcher.stitch(imgs, pano); + stopTimer(); + } + + EXPECT_NEAR(pano.size().width, 1124, 50); + EXPECT_NEAR(pano.size().height, 644, 30); + + SANITY_CHECK_NOTHING(); +} + +OCL_PERF_TEST_P(stitch, boat, TEST_DETECTORS) +{ + UMat pano; + + vector _imgs; + _imgs.push_back( imread( getDataPath("stitching/boat1.jpg") ) ); + _imgs.push_back( imread( getDataPath("stitching/boat2.jpg") ) ); + _imgs.push_back( imread( getDataPath("stitching/boat3.jpg") ) ); + _imgs.push_back( imread( getDataPath("stitching/boat4.jpg") ) ); + _imgs.push_back( imread( getDataPath("stitching/boat5.jpg") ) ); + _imgs.push_back( imread( getDataPath("stitching/boat6.jpg") ) ); + vector imgs = ToUMat(_imgs); + + Ptr featuresFinder = GetParam() == "orb" + ? Ptr(new detail::OrbFeaturesFinder()) + : Ptr(new detail::SurfFeaturesFinder()); + + Ptr featuresMatcher = GetParam() == "orb" + ? 
makePtr(false, ORB_MATCH_CONFIDENCE) + : makePtr(false, SURF_MATCH_CONFIDENCE); + + declare.iterations(20); + + while(next()) + { + Stitcher stitcher = Stitcher::createDefault(); + stitcher.setFeaturesFinder(featuresFinder); + stitcher.setFeaturesMatcher(featuresMatcher); + stitcher.setWarper(makePtr()); + stitcher.setRegistrationResol(WORK_MEGAPIX); + + startTimer(); + stitcher.stitch(imgs, pano); + stopTimer(); + } + + EXPECT_NEAR(pano.size().width, 10789, 200); + EXPECT_NEAR(pano.size().height, 2663, 100); + + SANITY_CHECK_NOTHING(); +} diff --git a/modules/stitching/perf/opencl/perf_warpers.cpp b/modules/stitching/perf/opencl/perf_warpers.cpp index 21fe22da7..6a8be4ebe 100644 --- a/modules/stitching/perf/opencl/perf_warpers.cpp +++ b/modules/stitching/perf/opencl/perf_warpers.cpp @@ -63,24 +63,12 @@ public: explicit WarperBase(int type, Size srcSize) { Ptr creator; - if (cv::ocl::useOpenCL()) - { - if (type == SphericalWarperType) - creator = makePtr(); - else if (type == CylindricalWarperType) - creator = makePtr(); - else if (type == PlaneWarperType) - creator = makePtr(); - } - else - { - if (type == SphericalWarperType) - creator = makePtr(); - else if (type == CylindricalWarperType) - creator = makePtr(); - else if (type == PlaneWarperType) - creator = makePtr(); - } + if (type == SphericalWarperType) + creator = makePtr(); + else if (type == CylindricalWarperType) + creator = makePtr(); + else if (type == PlaneWarperType) + creator = makePtr(); CV_Assert(!creator.empty()); K = Mat::eye(3, 3, CV_32FC1); diff --git a/modules/stitching/src/blenders.cpp b/modules/stitching/src/blenders.cpp index 446bfc131..03aad752f 100644 --- a/modules/stitching/src/blenders.cpp +++ b/modules/stitching/src/blenders.cpp @@ -41,6 +41,7 @@ //M*/ #include "precomp.hpp" +#include "opencl_kernels.hpp" namespace cv { namespace detail { @@ -76,8 +77,13 @@ void Blender::prepare(Rect dst_roi) } -void Blender::feed(const Mat &img, const Mat &mask, Point tl) +void 
Blender::feed(InputArray _img, InputArray _mask, Point tl) { + Mat img = _img.getMat(); + Mat mask = _mask.getMat(); + Mat dst = dst_.getMat(ACCESS_RW); + Mat dst_mask = dst_mask_.getMat(ACCESS_RW); + CV_Assert(img.type() == CV_16SC3); CV_Assert(mask.type() == CV_8U); int dx = tl.x - dst_roi_.x; @@ -86,9 +92,9 @@ void Blender::feed(const Mat &img, const Mat &mask, Point tl) for (int y = 0; y < img.rows; ++y) { const Point3_ *src_row = img.ptr >(y); - Point3_ *dst_row = dst_.ptr >(dy + y); + Point3_ *dst_row = dst.ptr >(dy + y); const uchar *mask_row = mask.ptr(y); - uchar *dst_mask_row = dst_mask_.ptr(dy + y); + uchar *dst_mask_row = dst_mask.ptr(dy + y); for (int x = 0; x < img.cols; ++x) { @@ -100,11 +106,13 @@ void Blender::feed(const Mat &img, const Mat &mask, Point tl) } -void Blender::blend(Mat &dst, Mat &dst_mask) +void Blender::blend(InputOutputArray dst, InputOutputArray dst_mask) { - dst_.setTo(Scalar::all(0), dst_mask_ == 0); - dst = dst_; - dst_mask = dst_mask_; + UMat mask; + compare(dst_mask_, 0, mask, CMP_EQ); + dst_.setTo(Scalar::all(0), mask); + dst.assign(dst_); + dst_mask.assign(dst_mask_); dst_.release(); dst_mask_.release(); } @@ -118,21 +126,27 @@ void FeatherBlender::prepare(Rect dst_roi) } -void FeatherBlender::feed(const Mat &img, const Mat &mask, Point tl) +void FeatherBlender::feed(InputArray _img, InputArray mask, Point tl) { + Mat img = _img.getMat(); + Mat dst = dst_.getMat(ACCESS_RW); + CV_Assert(img.type() == CV_16SC3); CV_Assert(mask.type() == CV_8U); createWeightMap(mask, sharpness_, weight_map_); + Mat weight_map = weight_map_.getMat(ACCESS_READ); + Mat dst_weight_map = dst_weight_map_.getMat(ACCESS_RW); + int dx = tl.x - dst_roi_.x; int dy = tl.y - dst_roi_.y; for (int y = 0; y < img.rows; ++y) { const Point3_* src_row = img.ptr >(y); - Point3_* dst_row = dst_.ptr >(dy + y); - const float* weight_row = weight_map_.ptr(y); - float* dst_weight_row = dst_weight_map_.ptr(dy + y); + Point3_* dst_row = dst.ptr >(dy + y); + const float* 
weight_row = weight_map.ptr(y); + float* dst_weight_row = dst_weight_map.ptr(dy + y); for (int x = 0; x < img.cols; ++x) { @@ -145,16 +159,16 @@ void FeatherBlender::feed(const Mat &img, const Mat &mask, Point tl) } -void FeatherBlender::blend(Mat &dst, Mat &dst_mask) +void FeatherBlender::blend(InputOutputArray dst, InputOutputArray dst_mask) { normalizeUsingWeightMap(dst_weight_map_, dst_); - dst_mask_ = dst_weight_map_ > WEIGHT_EPS; + compare(dst_weight_map_, WEIGHT_EPS, dst_mask_, CMP_GT); Blender::blend(dst, dst_mask); } -Rect FeatherBlender::createWeightMaps(const std::vector &masks, const std::vector &corners, - std::vector &weight_maps) +Rect FeatherBlender::createWeightMaps(const std::vector &masks, const std::vector &corners, + std::vector &weight_maps) { weight_maps.resize(masks.size()); for (size_t i = 0; i < masks.size(); ++i) @@ -168,7 +182,7 @@ Rect FeatherBlender::createWeightMaps(const std::vector &masks, const std:: { Rect roi(corners[i].x - dst_roi.x, corners[i].y - dst_roi.y, weight_maps[i].cols, weight_maps[i].rows); - weights_sum(roi) += weight_maps[i]; + add(weights_sum(roi), weight_maps[i], weights_sum(roi)); } for (size_t i = 0; i < weight_maps.size(); ++i) @@ -232,9 +246,39 @@ void MultiBandBlender::prepare(Rect dst_roi) } } - -void MultiBandBlender::feed(const Mat &img, const Mat &mask, Point tl) +#ifdef HAVE_OPENCL +static bool ocl_MultiBandBlender_feed(InputArray _src, InputArray _weight, + InputOutputArray _dst, InputOutputArray _dst_weight) { + String buildOptions = "-D DEFINE_feed"; + ocl::buildOptionsAddMatrixDescription(buildOptions, "src", _src); + ocl::buildOptionsAddMatrixDescription(buildOptions, "weight", _weight); + ocl::buildOptionsAddMatrixDescription(buildOptions, "dst", _dst); + ocl::buildOptionsAddMatrixDescription(buildOptions, "dstWeight", _dst_weight); + ocl::Kernel k("feed", ocl::stitching::multibandblend_oclsrc, buildOptions); + if (k.empty()) + return false; + + UMat src = _src.getUMat(); + + 
k.args(ocl::KernelArg::ReadOnly(src), + ocl::KernelArg::ReadOnly(_weight.getUMat()), + ocl::KernelArg::ReadWrite(_dst.getUMat()), + ocl::KernelArg::ReadWrite(_dst_weight.getUMat()) + ); + + size_t globalsize[2] = {src.cols, src.rows }; + return k.run(2, globalsize, NULL, false); +} +#endif + +void MultiBandBlender::feed(InputArray _img, InputArray mask, Point tl) +{ +#if ENABLE_LOG + int64 t = getTickCount(); +#endif + + UMat img = _img.getUMat(); CV_Assert(img.type() == CV_16SC3 || img.type() == CV_8UC3); CV_Assert(mask.type() == CV_8U); @@ -269,27 +313,39 @@ void MultiBandBlender::feed(const Mat &img, const Mat &mask, Point tl) int right = br_new.x - tl.x - img.cols; // Create the source image Laplacian pyramid - Mat img_with_border; - copyMakeBorder(img, img_with_border, top, bottom, left, right, + UMat img_with_border; + copyMakeBorder(_img, img_with_border, top, bottom, left, right, BORDER_REFLECT); - std::vector src_pyr_laplace; + LOGLN(" Add border to the source image, time: " << ((getTickCount() - t) / getTickFrequency()) << " sec"); +#if ENABLE_LOG + t = getTickCount(); +#endif + + std::vector src_pyr_laplace; if (can_use_gpu_ && img_with_border.depth() == CV_16S) createLaplacePyrGpu(img_with_border, num_bands_, src_pyr_laplace); else createLaplacePyr(img_with_border, num_bands_, src_pyr_laplace); + LOGLN(" Create the source image Laplacian pyramid, time: " << ((getTickCount() - t) / getTickFrequency()) << " sec"); +#if ENABLE_LOG + t = getTickCount(); +#endif + // Create the weight map Gaussian pyramid - Mat weight_map; - std::vector weight_pyr_gauss(num_bands_ + 1); + UMat weight_map; + std::vector weight_pyr_gauss(num_bands_ + 1); if(weight_type_ == CV_32F) { - mask.convertTo(weight_map, CV_32F, 1./255.); + mask.getUMat().convertTo(weight_map, CV_32F, 1./255.); } - else// weight_type_ == CV_16S + else // weight_type_ == CV_16S { - mask.convertTo(weight_map, CV_16S); - add(weight_map, 1, weight_map, mask != 0); + mask.getUMat().convertTo(weight_map, 
CV_16S); + UMat add_mask; + compare(mask, 0, add_mask, CMP_NE); + add(weight_map, Scalar::all(1), weight_map, add_mask); } copyMakeBorder(weight_map, weight_pyr_gauss[0], top, bottom, left, right, BORDER_CONSTANT); @@ -297,66 +353,77 @@ void MultiBandBlender::feed(const Mat &img, const Mat &mask, Point tl) for (int i = 0; i < num_bands_; ++i) pyrDown(weight_pyr_gauss[i], weight_pyr_gauss[i + 1]); + LOGLN(" Create the weight map Gaussian pyramid, time: " << ((getTickCount() - t) / getTickFrequency()) << " sec"); +#if ENABLE_LOG + t = getTickCount(); +#endif + int y_tl = tl_new.y - dst_roi_.y; int y_br = br_new.y - dst_roi_.y; int x_tl = tl_new.x - dst_roi_.x; int x_br = br_new.x - dst_roi_.x; // Add weighted layer of the source image to the final Laplacian pyramid layer - if(weight_type_ == CV_32F) + for (int i = 0; i <= num_bands_; ++i) { - for (int i = 0; i <= num_bands_; ++i) + Rect rc(x_tl, y_tl, x_br - x_tl, y_br - y_tl); +#ifdef HAVE_OPENCL + if ( !cv::ocl::useOpenCL() || + !ocl_MultiBandBlender_feed(src_pyr_laplace[i], weight_pyr_gauss[i], + dst_pyr_laplace_[i](rc), dst_band_weights_[i](rc)) ) +#endif { - for (int y = y_tl; y < y_br; ++y) + Mat _src_pyr_laplace = src_pyr_laplace[i].getMat(ACCESS_READ); + Mat _dst_pyr_laplace = dst_pyr_laplace_[i](rc).getMat(ACCESS_RW); + Mat _weight_pyr_gauss = weight_pyr_gauss[i].getMat(ACCESS_READ); + Mat _dst_band_weights = dst_band_weights_[i](rc).getMat(ACCESS_RW); + if(weight_type_ == CV_32F) { - int y_ = y - y_tl; - const Point3_* src_row = src_pyr_laplace[i].ptr >(y_); - Point3_* dst_row = dst_pyr_laplace_[i].ptr >(y); - const float* weight_row = weight_pyr_gauss[i].ptr(y_); - float* dst_weight_row = dst_band_weights_[i].ptr(y); - - for (int x = x_tl; x < x_br; ++x) + for (int y = 0; y < rc.height; ++y) { - int x_ = x - x_tl; - dst_row[x].x += static_cast(src_row[x_].x * weight_row[x_]); - dst_row[x].y += static_cast(src_row[x_].y * weight_row[x_]); - dst_row[x].z += static_cast(src_row[x_].z * weight_row[x_]); - 
dst_weight_row[x] += weight_row[x_]; + const Point3_* src_row = _src_pyr_laplace.ptr >(y); + Point3_* dst_row = _dst_pyr_laplace.ptr >(y); + const float* weight_row = _weight_pyr_gauss.ptr(y); + float* dst_weight_row = _dst_band_weights.ptr(y); + + for (int x = 0; x < rc.width; ++x) + { + dst_row[x].x += static_cast(src_row[x].x * weight_row[x]); + dst_row[x].y += static_cast(src_row[x].y * weight_row[x]); + dst_row[x].z += static_cast(src_row[x].z * weight_row[x]); + dst_weight_row[x] += weight_row[x]; + } } } - x_tl /= 2; y_tl /= 2; - x_br /= 2; y_br /= 2; - } - } - else// weight_type_ == CV_16S - { - for (int i = 0; i <= num_bands_; ++i) - { - for (int y = y_tl; y < y_br; ++y) + else // weight_type_ == CV_16S { - int y_ = y - y_tl; - const Point3_* src_row = src_pyr_laplace[i].ptr >(y_); - Point3_* dst_row = dst_pyr_laplace_[i].ptr >(y); - const short* weight_row = weight_pyr_gauss[i].ptr(y_); - short* dst_weight_row = dst_band_weights_[i].ptr(y); - - for (int x = x_tl; x < x_br; ++x) + for (int y = 0; y < y_br - y_tl; ++y) { - int x_ = x - x_tl; - dst_row[x].x += short((src_row[x_].x * weight_row[x_]) >> 8); - dst_row[x].y += short((src_row[x_].y * weight_row[x_]) >> 8); - dst_row[x].z += short((src_row[x_].z * weight_row[x_]) >> 8); - dst_weight_row[x] += weight_row[x_]; + const Point3_* src_row = _src_pyr_laplace.ptr >(y); + Point3_* dst_row = _dst_pyr_laplace.ptr >(y); + const short* weight_row = _weight_pyr_gauss.ptr(y); + short* dst_weight_row = _dst_band_weights.ptr(y); + + for (int x = 0; x < x_br - x_tl; ++x) + { + dst_row[x].x += short((src_row[x].x * weight_row[x]) >> 8); + dst_row[x].y += short((src_row[x].y * weight_row[x]) >> 8); + dst_row[x].z += short((src_row[x].z * weight_row[x]) >> 8); + dst_weight_row[x] += weight_row[x]; + } } } - x_tl /= 2; y_tl /= 2; - x_br /= 2; y_br /= 2; } + + x_tl /= 2; y_tl /= 2; + x_br /= 2; y_br /= 2; } + + LOGLN(" Add weighted layer of the source image to the final Laplacian pyramid layer, time: " << 
((getTickCount() - t) / getTickFrequency()) << " sec"); } -void MultiBandBlender::blend(Mat &dst, Mat &dst_mask) +void MultiBandBlender::blend(InputOutputArray dst, InputOutputArray dst_mask) { for (int i = 0; i <= num_bands_; ++i) normalizeUsingWeightMap(dst_band_weights_[i], dst_pyr_laplace_[i]); @@ -366,10 +433,10 @@ void MultiBandBlender::blend(Mat &dst, Mat &dst_mask) else restoreImageFromLaplacePyr(dst_pyr_laplace_); - dst_ = dst_pyr_laplace_[0]; - dst_ = dst_(Range(0, dst_roi_final_.height), Range(0, dst_roi_final_.width)); - dst_mask_ = dst_band_weights_[0] > WEIGHT_EPS; - dst_mask_ = dst_mask_(Range(0, dst_roi_final_.height), Range(0, dst_roi_final_.width)); + Rect dst_rc(0, 0, dst_roi_final_.width, dst_roi_final_.height); + dst_ = dst_pyr_laplace_[0](dst_rc); + UMat _dst_mask; + compare(dst_band_weights_[0](dst_rc), WEIGHT_EPS, dst_mask_, CMP_GT); dst_pyr_laplace_.clear(); dst_band_weights_.clear(); @@ -380,59 +447,92 @@ void MultiBandBlender::blend(Mat &dst, Mat &dst_mask) ////////////////////////////////////////////////////////////////////////////// // Auxiliary functions -void normalizeUsingWeightMap(const Mat& weight, Mat& src) +#ifdef HAVE_OPENCL +static bool ocl_normalizeUsingWeightMap(InputArray _weight, InputOutputArray _mat) +{ + String buildOptions = "-D DEFINE_normalizeUsingWeightMap"; + ocl::buildOptionsAddMatrixDescription(buildOptions, "mat", _mat); + ocl::buildOptionsAddMatrixDescription(buildOptions, "weight", _weight); + ocl::Kernel k("normalizeUsingWeightMap", ocl::stitching::multibandblend_oclsrc, buildOptions); + if (k.empty()) + return false; + + UMat mat = _mat.getUMat(); + + k.args(ocl::KernelArg::ReadWrite(mat), + ocl::KernelArg::ReadOnly(_weight.getUMat()) + ); + + size_t globalsize[2] = {mat.cols, mat.rows }; + return k.run(2, globalsize, NULL, false); +} +#endif + +void normalizeUsingWeightMap(InputArray _weight, InputOutputArray _src) { #ifdef HAVE_TEGRA_OPTIMIZATION if(tegra::normalizeUsingWeightMap(weight, src)) return; 
#endif - CV_Assert(src.type() == CV_16SC3); - if(weight.type() == CV_32FC1) +#ifdef HAVE_OPENCL + if ( !cv::ocl::useOpenCL() || + !ocl_normalizeUsingWeightMap(_weight, _src) ) +#endif { - for (int y = 0; y < src.rows; ++y) - { - Point3_ *row = src.ptr >(y); - const float *weight_row = weight.ptr(y); + Mat weight = _weight.getMat(); + Mat src = _src.getMat(); - for (int x = 0; x < src.cols; ++x) + CV_Assert(src.type() == CV_16SC3); + + if(weight.type() == CV_32FC1) + { + for (int y = 0; y < src.rows; ++y) { - row[x].x = static_cast(row[x].x / (weight_row[x] + WEIGHT_EPS)); - row[x].y = static_cast(row[x].y / (weight_row[x] + WEIGHT_EPS)); - row[x].z = static_cast(row[x].z / (weight_row[x] + WEIGHT_EPS)); + Point3_ *row = src.ptr >(y); + const float *weight_row = weight.ptr(y); + + for (int x = 0; x < src.cols; ++x) + { + row[x].x = static_cast(row[x].x / (weight_row[x] + WEIGHT_EPS)); + row[x].y = static_cast(row[x].y / (weight_row[x] + WEIGHT_EPS)); + row[x].z = static_cast(row[x].z / (weight_row[x] + WEIGHT_EPS)); + } } } - } - else - { - CV_Assert(weight.type() == CV_16SC1); - - for (int y = 0; y < src.rows; ++y) + else { - const short *weight_row = weight.ptr(y); - Point3_ *row = src.ptr >(y); + CV_Assert(weight.type() == CV_16SC1); - for (int x = 0; x < src.cols; ++x) + for (int y = 0; y < src.rows; ++y) { - int w = weight_row[x] + 1; - row[x].x = static_cast((row[x].x << 8) / w); - row[x].y = static_cast((row[x].y << 8) / w); - row[x].z = static_cast((row[x].z << 8) / w); + const short *weight_row = weight.ptr(y); + Point3_ *row = src.ptr >(y); + + for (int x = 0; x < src.cols; ++x) + { + int w = weight_row[x] + 1; + row[x].x = static_cast((row[x].x << 8) / w); + row[x].y = static_cast((row[x].y << 8) / w); + row[x].z = static_cast((row[x].z << 8) / w); + } } } } } -void createWeightMap(const Mat &mask, float sharpness, Mat &weight) +void createWeightMap(InputArray mask, float sharpness, InputOutputArray weight) { CV_Assert(mask.type() == CV_8U); 
distanceTransform(mask, weight, DIST_L1, 3); - threshold(weight * sharpness, weight, 1.f, 1.f, THRESH_TRUNC); + UMat tmp; + multiply(weight, sharpness, tmp); + threshold(tmp, weight, 1.f, 1.f, THRESH_TRUNC); } -void createLaplacePyr(const Mat &img, int num_levels, std::vector &pyr) +void createLaplacePyr(InputArray img, int num_levels, std::vector &pyr) { #ifdef HAVE_TEGRA_OPTIMIZATION if(tegra::createLaplacePyr(img, num_levels, pyr)) @@ -445,18 +545,18 @@ void createLaplacePyr(const Mat &img, int num_levels, std::vector &pyr) { if(num_levels == 0) { - img.convertTo(pyr[0], CV_16S); + img.getUMat().convertTo(pyr[0], CV_16S); return; } - Mat downNext; - Mat current = img; + UMat downNext; + UMat current = img.getUMat(); pyrDown(img, downNext); for(int i = 1; i < num_levels; ++i) { - Mat lvl_up; - Mat lvl_down; + UMat lvl_up; + UMat lvl_down; pyrDown(downNext, lvl_down); pyrUp(downNext, lvl_up, current.size()); @@ -467,7 +567,7 @@ void createLaplacePyr(const Mat &img, int num_levels, std::vector &pyr) } { - Mat lvl_up; + UMat lvl_up; pyrUp(downNext, lvl_up, current.size()); subtract(current, lvl_up, pyr[num_levels-1], noArray(), CV_16S); @@ -476,10 +576,10 @@ void createLaplacePyr(const Mat &img, int num_levels, std::vector &pyr) } else { - pyr[0] = img; + pyr[0] = img.getUMat(); for (int i = 0; i < num_levels; ++i) pyrDown(pyr[i], pyr[i + 1]); - Mat tmp; + UMat tmp; for (int i = 0; i < num_levels; ++i) { pyrUp(pyr[i + 1], tmp, pyr[i].size()); @@ -489,7 +589,7 @@ void createLaplacePyr(const Mat &img, int num_levels, std::vector &pyr) } -void createLaplacePyrGpu(const Mat &img, int num_levels, std::vector &pyr) +void createLaplacePyrGpu(InputArray img, int num_levels, std::vector &pyr) { #if defined(HAVE_OPENCV_CUDAARITHM) && defined(HAVE_OPENCV_CUDAWARPING) pyr.resize(num_levels + 1); @@ -517,11 +617,11 @@ void createLaplacePyrGpu(const Mat &img, int num_levels, std::vector &pyr) } -void restoreImageFromLaplacePyr(std::vector &pyr) +void 
restoreImageFromLaplacePyr(std::vector &pyr) { if (pyr.empty()) return; - Mat tmp; + UMat tmp; for (size_t i = pyr.size() - 1; i > 0; --i) { pyrUp(pyr[i], tmp, pyr[i - 1].size()); @@ -530,7 +630,7 @@ void restoreImageFromLaplacePyr(std::vector &pyr) } -void restoreImageFromLaplacePyrGpu(std::vector &pyr) +void restoreImageFromLaplacePyrGpu(std::vector &pyr) { #if defined(HAVE_OPENCV_CUDAARITHM) && defined(HAVE_OPENCV_CUDAWARPING) if (pyr.empty()) diff --git a/modules/stitching/src/exposure_compensate.cpp b/modules/stitching/src/exposure_compensate.cpp index 78ce6d371..1f04fff9f 100644 --- a/modules/stitching/src/exposure_compensate.cpp +++ b/modules/stitching/src/exposure_compensate.cpp @@ -58,18 +58,18 @@ Ptr ExposureCompensator::createDefault(int type) } -void ExposureCompensator::feed(const std::vector &corners, const std::vector &images, - const std::vector &masks) +void ExposureCompensator::feed(const std::vector &corners, const std::vector &images, + const std::vector &masks) { - std::vector > level_masks; + std::vector > level_masks; for (size_t i = 0; i < masks.size(); ++i) level_masks.push_back(std::make_pair(masks[i], 255)); feed(corners, images, level_masks); } -void GainCompensator::feed(const std::vector &corners, const std::vector &images, - const std::vector > &masks) +void GainCompensator::feed(const std::vector &corners, const std::vector &images, + const std::vector > &masks) { LOGLN("Exposure compensation..."); #if ENABLE_LOG @@ -93,11 +93,11 @@ void GainCompensator::feed(const std::vector &corners, const std::vector< Rect roi; if (overlapRoi(corners[i], corners[j], images[i].size(), images[j].size(), roi)) { - subimg1 = images[i](Rect(roi.tl() - corners[i], roi.br() - corners[i])); - subimg2 = images[j](Rect(roi.tl() - corners[j], roi.br() - corners[j])); + subimg1 = images[i](Rect(roi.tl() - corners[i], roi.br() - corners[i])).getMat(ACCESS_READ); + subimg2 = images[j](Rect(roi.tl() - corners[j], roi.br() - corners[j])).getMat(ACCESS_READ); - 
submask1 = masks[i].first(Rect(roi.tl() - corners[i], roi.br() - corners[i])); - submask2 = masks[j].first(Rect(roi.tl() - corners[j], roi.br() - corners[j])); + submask1 = masks[i].first(Rect(roi.tl() - corners[i], roi.br() - corners[i])).getMat(ACCESS_READ); + submask2 = masks[j].first(Rect(roi.tl() - corners[j], roi.br() - corners[j])).getMat(ACCESS_READ); intersect = (submask1 == masks[i].second) & (submask2 == masks[j].second); N(i, j) = N(j, i) = std::max(1, countNonZero(intersect)); @@ -145,9 +145,9 @@ void GainCompensator::feed(const std::vector &corners, const std::vector< } -void GainCompensator::apply(int index, Point /*corner*/, Mat &image, const Mat &/*mask*/) +void GainCompensator::apply(int index, Point /*corner*/, InputOutputArray image, InputArray /*mask*/) { - image *= gains_(index, 0); + multiply(image, gains_(index, 0), image); } @@ -160,8 +160,8 @@ std::vector GainCompensator::gains() const } -void BlocksGainCompensator::feed(const std::vector &corners, const std::vector &images, - const std::vector > &masks) +void BlocksGainCompensator::feed(const std::vector &corners, const std::vector &images, + const std::vector > &masks) { CV_Assert(corners.size() == images.size() && images.size() == masks.size()); @@ -169,8 +169,8 @@ void BlocksGainCompensator::feed(const std::vector &corners, const std::v std::vector bl_per_imgs(num_images); std::vector block_corners; - std::vector block_images; - std::vector > block_masks; + std::vector block_images; + std::vector > block_masks; // Construct blocks for gain compensator for (int img_idx = 0; img_idx < num_images; ++img_idx) @@ -208,11 +208,14 @@ void BlocksGainCompensator::feed(const std::vector &corners, const std::v for (int img_idx = 0; img_idx < num_images; ++img_idx) { Size bl_per_img = bl_per_imgs[img_idx]; - gain_maps_[img_idx].create(bl_per_img); + gain_maps_[img_idx].create(bl_per_img, CV_32F); - for (int by = 0; by < bl_per_img.height; ++by) - for (int bx = 0; bx < bl_per_img.width; ++bx, 
++bl_idx) - gain_maps_[img_idx](by, bx) = static_cast(gains[bl_idx]); + { + Mat_ gain_map = gain_maps_[img_idx].getMat(ACCESS_WRITE); + for (int by = 0; by < bl_per_img.height; ++by) + for (int bx = 0; bx < bl_per_img.width; ++bx, ++bl_idx) + gain_map(by, bx) = static_cast(gains[bl_idx]); + } sepFilter2D(gain_maps_[img_idx], gain_maps_[img_idx], CV_32F, ker, ker); sepFilter2D(gain_maps_[img_idx], gain_maps_[img_idx], CV_32F, ker, ker); @@ -220,16 +223,18 @@ void BlocksGainCompensator::feed(const std::vector &corners, const std::v } -void BlocksGainCompensator::apply(int index, Point /*corner*/, Mat &image, const Mat &/*mask*/) +void BlocksGainCompensator::apply(int index, Point /*corner*/, InputOutputArray _image, InputArray /*mask*/) { - CV_Assert(image.type() == CV_8UC3); + CV_Assert(_image.type() == CV_8UC3); - Mat_ gain_map; - if (gain_maps_[index].size() == image.size()) - gain_map = gain_maps_[index]; + UMat u_gain_map; + if (gain_maps_[index].size() == _image.size()) + u_gain_map = gain_maps_[index]; else - resize(gain_maps_[index], gain_map, image.size(), 0, 0, INTER_LINEAR); + resize(gain_maps_[index], u_gain_map, _image.size(), 0, 0, INTER_LINEAR); + Mat_ gain_map = u_gain_map.getMat(ACCESS_READ); + Mat image = _image.getMat(); for (int y = 0; y < image.rows; ++y) { const float* gain_row = gain_map.ptr(y); diff --git a/modules/stitching/src/matchers.cpp b/modules/stitching/src/matchers.cpp index f463518b6..c303c4aab 100644 --- a/modules/stitching/src/matchers.cpp +++ b/modules/stitching/src/matchers.cpp @@ -155,21 +155,31 @@ void CpuMatcher::match(const ImageFeatures &features1, const ImageFeatures &feat matches_info.matches.clear(); - Ptr indexParams = makePtr(); - Ptr searchParams = makePtr(); - - if (features2.descriptors.depth() == CV_8U) + Ptr matcher; +#if 0 // TODO check this + if (ocl::useOpenCL()) { - indexParams->setAlgorithm(cvflann::FLANN_INDEX_LSH); - searchParams->setAlgorithm(cvflann::FLANN_INDEX_LSH); + matcher = makePtr((int)NORM_L2); } + 
else +#endif + { + Ptr indexParams = makePtr(); + Ptr searchParams = makePtr(); - FlannBasedMatcher matcher(indexParams, searchParams); + if (features2.descriptors.depth() == CV_8U) + { + indexParams->setAlgorithm(cvflann::FLANN_INDEX_LSH); + searchParams->setAlgorithm(cvflann::FLANN_INDEX_LSH); + } + + matcher = makePtr(indexParams, searchParams); + } std::vector< std::vector > pair_matches; MatchesSet matches; // Find 1->2 matches - matcher.knnMatch(features1.descriptors, features2.descriptors, pair_matches, 2); + matcher->knnMatch(features1.descriptors, features2.descriptors, pair_matches, 2); for (size_t i = 0; i < pair_matches.size(); ++i) { if (pair_matches[i].size() < 2) @@ -186,7 +196,7 @@ void CpuMatcher::match(const ImageFeatures &features1, const ImageFeatures &feat // Find 2->1 matches pair_matches.clear(); - matcher.knnMatch(features2.descriptors, features1.descriptors, pair_matches, 2); + matcher->knnMatch(features2.descriptors, features1.descriptors, pair_matches, 2); for (size_t i = 0; i < pair_matches.size(); ++i) { if (pair_matches[i].size() < 2) @@ -264,14 +274,14 @@ void GpuMatcher::collectGarbage() namespace cv { namespace detail { -void FeaturesFinder::operator ()(const Mat &image, ImageFeatures &features) +void FeaturesFinder::operator ()(InputArray image, ImageFeatures &features) { find(image, features); features.img_size = image.size(); } -void FeaturesFinder::operator ()(const Mat &image, ImageFeatures &features, const std::vector &rois) +void FeaturesFinder::operator ()(InputArray image, ImageFeatures &features, const std::vector &rois) { std::vector roi_features(rois.size()); size_t total_kps_count = 0; @@ -279,7 +289,7 @@ void FeaturesFinder::operator ()(const Mat &image, ImageFeatures &features, cons for (size_t i = 0; i < rois.size(); ++i) { - find(image(rois[i]), roi_features[i]); + find(image.getUMat()(rois[i]), roi_features[i]); total_kps_count += roi_features[i].keypoints.size(); total_descriptors_height += 
roi_features[i].descriptors.rows; } @@ -300,7 +310,7 @@ void FeaturesFinder::operator ()(const Mat &image, ImageFeatures &features, cons features.keypoints[kp_idx].pt.x += (float)rois[i].x; features.keypoints[kp_idx].pt.y += (float)rois[i].y; } - Mat subdescr = features.descriptors.rowRange( + UMat subdescr = features.descriptors.rowRange( descr_offset, descr_offset + roi_features[i].descriptors.rows); roi_features[i].descriptors.copyTo(subdescr); descr_offset += roi_features[i].descriptors.rows; @@ -337,9 +347,9 @@ SurfFeaturesFinder::SurfFeaturesFinder(double hess_thresh, int num_octaves, int } } -void SurfFeaturesFinder::find(const Mat &image, ImageFeatures &features) +void SurfFeaturesFinder::find(InputArray image, ImageFeatures &features) { - Mat gray_image; + UMat gray_image; CV_Assert((image.type() == CV_8UC3) || (image.type() == CV_8UC1)); if(image.type() == CV_8UC3) { @@ -347,7 +357,7 @@ void SurfFeaturesFinder::find(const Mat &image, ImageFeatures &features) } else { - gray_image = image; + gray_image = image.getUMat(); } if (!surf) { @@ -356,7 +366,7 @@ void SurfFeaturesFinder::find(const Mat &image, ImageFeatures &features) } else { - Mat descriptors; + UMat descriptors; (*surf)(gray_image, Mat(), features.keypoints, descriptors); features.descriptors = descriptors.reshape(1, (int)features.keypoints.size()); } @@ -368,9 +378,9 @@ OrbFeaturesFinder::OrbFeaturesFinder(Size _grid_size, int n_features, float scal orb = makePtr(n_features * (99 + grid_size.area())/100/grid_size.area(), scaleFactor, nlevels); } -void OrbFeaturesFinder::find(const Mat &image, ImageFeatures &features) +void OrbFeaturesFinder::find(InputArray image, ImageFeatures &features) { - Mat gray_image; + UMat gray_image; CV_Assert((image.type() == CV_8UC3) || (image.type() == CV_8UC4) || (image.type() == CV_8UC1)); @@ -379,7 +389,7 @@ void OrbFeaturesFinder::find(const Mat &image, ImageFeatures &features) } else if (image.type() == CV_8UC4) { cvtColor(image, gray_image, COLOR_BGRA2GRAY); 
} else if (image.type() == CV_8UC1) { - gray_image=image; + gray_image = image.getUMat(); } else { CV_Error(Error::StsUnsupportedFormat, ""); } @@ -392,7 +402,8 @@ void OrbFeaturesFinder::find(const Mat &image, ImageFeatures &features) features.descriptors.release(); std::vector points; - Mat descriptors; + Mat _descriptors; + UMat descriptors; for (int r = 0; r < grid_size.height; ++r) for (int c = 0; c < grid_size.width; ++c) @@ -408,13 +419,13 @@ void OrbFeaturesFinder::find(const Mat &image, ImageFeatures &features) // << " xl=" << xl << ", xr=" << xr << ", gray_image.data=" << ((size_t)gray_image.data) << ", " // << "gray_image.dims=" << gray_image.dims << "\n"); - Mat gray_image_part=gray_image(Range(yl, yr), Range(xl, xr)); + UMat gray_image_part=gray_image(Range(yl, yr), Range(xl, xr)); // LOGLN("OrbFeaturesFinder::find: gray_image_part.empty=" << (gray_image_part.empty()?"true":"false") << ", " // << " gray_image_part.size()=(" << gray_image_part.size().width << "x" << gray_image_part.size().height << "), " // << " gray_image_part.dims=" << gray_image_part.dims << ", " // << " gray_image_part.data=" << ((size_t)gray_image_part.data) << "\n"); - (*orb)(gray_image_part, Mat(), points, descriptors); + (*orb)(gray_image_part, UMat(), points, descriptors); features.keypoints.reserve(features.keypoints.size() + points.size()); for (std::vector::iterator kp = points.begin(); kp != points.end(); ++kp) @@ -423,8 +434,12 @@ void OrbFeaturesFinder::find(const Mat &image, ImageFeatures &features) kp->pt.y += yl; features.keypoints.push_back(*kp); } - features.descriptors.push_back(descriptors); + _descriptors.push_back(descriptors.getMat(ACCESS_READ)); } + + // TODO optimize copyTo() + //features.descriptors = _descriptors.getUMat(ACCESS_READ); + _descriptors.copyTo(features.descriptors); } } @@ -442,7 +457,7 @@ SurfFeaturesFinderGpu::SurfFeaturesFinderGpu(double hess_thresh, int num_octaves } -void SurfFeaturesFinderGpu::find(const Mat &image, ImageFeatures 
&features) +void SurfFeaturesFinderGpu::find(InputArray image, ImageFeatures &features) { CV_Assert(image.depth() == CV_8U); @@ -499,12 +514,12 @@ const MatchesInfo& MatchesInfo::operator =(const MatchesInfo &other) ////////////////////////////////////////////////////////////////////////////// void FeaturesMatcher::operator ()(const std::vector &features, std::vector &pairwise_matches, - const Mat &mask) + const UMat &mask) { const int num_images = static_cast(features.size()); CV_Assert(mask.empty() || (mask.type() == CV_8U && mask.cols == num_images && mask.rows)); - Mat_ mask_(mask); + Mat_ mask_(mask.getMat(ACCESS_READ)); if (mask_.empty()) mask_ = Mat::ones(num_images, num_images, CV_8U); diff --git a/modules/stitching/src/opencl/multibandblend.cl b/modules/stitching/src/opencl/multibandblend.cl new file mode 100644 index 000000000..72d3de0fb --- /dev/null +++ b/modules/stitching/src/opencl/multibandblend.cl @@ -0,0 +1,282 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. +// +// Copyright (C) 2014, Itseez, Inc, all rights reserved. + +// +// Common preprocessors macro +// + +// +// TODO: Move this common code into "header" file +// + +#ifndef NL // New Line: for preprocessor debugging +#define NL +#endif + +#define REF(x) x +#define __CAT(x, y) x##y +#define CAT(x, y) __CAT(x, y) + +// +// All matrixes are come with this description ("name" is a name of matrix): +// * name_CN - number of channels (1,2,3,4) +// * name_DEPTH - numeric value of CV_MAT_DEPTH(type). See CV_8U, CV_32S, etc macro below. +// +// Currently we also pass these attributes (to reduce this macro block): +// * name_T - datatype (int, float, uchar4, float4) +// * name_T1 - datatype for one channel (int, float, uchar). +// It is equal to result of "T1(name_T)" macro +// * name_TSIZE - CV_ELEM_SIZE(type). 
+// We can't use sizeof(name_T) here, because sizeof(float3) is usually equal to 8, not 6. +// * name_T1SIZE - CV_ELEM_SIZE1(type) +// + +// +// Usage sample: +// +// #define workType TYPE(float, src_CN) +// #define convertToWorkType CONVERT_TO(workType) +// #define convertWorkTypeToDstType CONVERT(workType, dst_T) +// +// __kernel void kernelFn(DECLARE_MAT_ARG(src), DECLARE_MAT_ARG(dst)) +// { +// const int x = get_global_id(0); +// const int y = get_global_id(1); +// +// if (x < srcWidth && y < srcHeight) +// { +// int src_byteOffset = MAT_BYTE_OFFSET(src, x, y); +// int dst_byteOffset = MAT_BYTE_OFFSET(dst, x, y); +// workType value = convertToWorkType(LOAD_MAT_AT(src, src_byteOffset)); +// +// ... value processing ... +// +// STORE_MAT_AT(dst, dst_byteOffset, convertWorkTypeToDstType(value)); +// } +// } +// + +#define DECLARE_MAT_ARG(name) \ + __global uchar* restrict name ## Ptr, \ + int name ## StepBytes, \ + int name ## Offset, \ + int name ## Height, \ + int name ## Width NL + +#define MAT_BYTE_OFFSET(name, x, y) mad24((y)/* + name ## OffsetY*/, name ## StepBytes, ((x)/* + name ## OffsetX*/) * (int)(name ## _TSIZE) + name ## Offset) +#define MAT_RELATIVE_BYTE_OFFSET(name, x, y) mad24(y, name ## StepBytes, (x) * (int)(name ## _TSIZE)) + +#define __LOAD_MAT_AT(name, byteOffset) *((const __global name ## _T*)(name ## Ptr + (byteOffset))) +#define __vload_CN__(name_cn) vload ## name_cn +#define __vload_CN_(name_cn) __vload_CN__(name_cn) +#define __vload_CN(name) __vload_CN_(name ## _CN) +#define __LOAD_MAT_AT_vload(name, byteOffset) __vload_CN(name)(0, ((const __global name ## _T1*)(name ## Ptr + (byteOffset)))) +#define __LOAD_MAT_AT_1 __LOAD_MAT_AT +#define __LOAD_MAT_AT_2 __LOAD_MAT_AT +#define __LOAD_MAT_AT_3 __LOAD_MAT_AT_vload +#define __LOAD_MAT_AT_4 __LOAD_MAT_AT +#define __LOAD_MAT_AT_CN__(name_cn) __LOAD_MAT_AT_ ## name_cn +#define __LOAD_MAT_AT_CN_(name_cn) __LOAD_MAT_AT_CN__(name_cn) +#define __LOAD_MAT_AT_CN(name) __LOAD_MAT_AT_CN_(name ## _CN) 
+#define LOAD_MAT_AT(name, byteOffset) __LOAD_MAT_AT_CN(name)(name, byteOffset) + +#define __STORE_MAT_AT(name, byteOffset, v) *((__global name ## _T*)(name ## Ptr + (byteOffset))) = v +#define __vstore_CN__(name_cn) vstore ## name_cn +#define __vstore_CN_(name_cn) __vstore_CN__(name_cn) +#define __vstore_CN(name) __vstore_CN_(name ## _CN) +#define __STORE_MAT_AT_vstore(name, byteOffset, v) __vstore_CN(name)(v, 0, ((__global name ## _T1*)(name ## Ptr + (byteOffset)))) +#define __STORE_MAT_AT_1 __STORE_MAT_AT +#define __STORE_MAT_AT_2 __STORE_MAT_AT +#define __STORE_MAT_AT_3 __STORE_MAT_AT_vstore +#define __STORE_MAT_AT_4 __STORE_MAT_AT +#define __STORE_MAT_AT_CN__(name_cn) __STORE_MAT_AT_ ## name_cn +#define __STORE_MAT_AT_CN_(name_cn) __STORE_MAT_AT_CN__(name_cn) +#define __STORE_MAT_AT_CN(name) __STORE_MAT_AT_CN_(name ## _CN) +#define STORE_MAT_AT(name, byteOffset, v) __STORE_MAT_AT_CN(name)(name, byteOffset, v) + +#define T1_uchar uchar +#define T1_uchar2 uchar +#define T1_uchar3 uchar +#define T1_uchar4 uchar +#define T1_char char +#define T1_char2 char +#define T1_char3 char +#define T1_char4 char +#define T1_ushort ushort +#define T1_ushort2 ushort +#define T1_ushort3 ushort +#define T1_ushort4 ushort +#define T1_short short +#define T1_short2 short +#define T1_short3 short +#define T1_short4 short +#define T1_int int +#define T1_int2 int +#define T1_int3 int +#define T1_int4 int +#define T1_float float +#define T1_float2 float +#define T1_float3 float +#define T1_float4 float +#define T1_double double +#define T1_double2 double +#define T1_double3 double +#define T1_double4 double +#define T1(type) REF(CAT(T1_, REF(type))) + +#define uchar1 uchar +#define char1 char +#define short1 short +#define ushort1 ushort +#define int1 int +#define float1 float +#define double1 double +#define TYPE(type, cn) REF(CAT(REF(type), REF(cn))) + +#define __CONVERT_MODE_uchar_uchar __NO_CONVERT +#define __CONVERT_MODE_uchar_char __CONVERT_sat +#define 
__CONVERT_MODE_uchar_ushort __CONVERT +#define __CONVERT_MODE_uchar_short __CONVERT +#define __CONVERT_MODE_uchar_int __CONVERT +#define __CONVERT_MODE_uchar_float __CONVERT +#define __CONVERT_MODE_uchar_double __CONVERT +#define __CONVERT_MODE_char_uchar __CONVERT_sat +#define __CONVERT_MODE_char_char __NO_CONVERT +#define __CONVERT_MODE_char_ushort __CONVERT_sat +#define __CONVERT_MODE_char_short __CONVERT +#define __CONVERT_MODE_char_int __CONVERT +#define __CONVERT_MODE_char_float __CONVERT +#define __CONVERT_MODE_char_double __CONVERT +#define __CONVERT_MODE_ushort_uchar __CONVERT_sat +#define __CONVERT_MODE_ushort_char __CONVERT_sat +#define __CONVERT_MODE_ushort_ushort __NO_CONVERT +#define __CONVERT_MODE_ushort_short __CONVERT_sat +#define __CONVERT_MODE_ushort_int __CONVERT +#define __CONVERT_MODE_ushort_float __CONVERT +#define __CONVERT_MODE_ushort_double __CONVERT +#define __CONVERT_MODE_short_uchar __CONVERT_sat +#define __CONVERT_MODE_short_char __CONVERT_sat +#define __CONVERT_MODE_short_ushort __CONVERT_sat +#define __CONVERT_MODE_short_short __NO_CONVERT +#define __CONVERT_MODE_short_int __CONVERT +#define __CONVERT_MODE_short_float __CONVERT +#define __CONVERT_MODE_short_double __CONVERT +#define __CONVERT_MODE_int_uchar __CONVERT_sat +#define __CONVERT_MODE_int_char __CONVERT_sat +#define __CONVERT_MODE_int_ushort __CONVERT_sat +#define __CONVERT_MODE_int_short __CONVERT_sat +#define __CONVERT_MODE_int_int __NO_CONVERT +#define __CONVERT_MODE_int_float __CONVERT +#define __CONVERT_MODE_int_double __CONVERT +#define __CONVERT_MODE_float_uchar __CONVERT_sat_rte +#define __CONVERT_MODE_float_char __CONVERT_sat_rte +#define __CONVERT_MODE_float_ushort __CONVERT_sat_rte +#define __CONVERT_MODE_float_short __CONVERT_sat_rte +#define __CONVERT_MODE_float_int __CONVERT_rte +#define __CONVERT_MODE_float_float __NO_CONVERT +#define __CONVERT_MODE_float_double __CONVERT +#define __CONVERT_MODE_double_uchar __CONVERT_sat_rte +#define 
__CONVERT_MODE_double_char __CONVERT_sat_rte +#define __CONVERT_MODE_double_ushort __CONVERT_sat_rte +#define __CONVERT_MODE_double_short __CONVERT_sat_rte +#define __CONVERT_MODE_double_int __CONVERT_rte +#define __CONVERT_MODE_double_float __CONVERT +#define __CONVERT_MODE_double_double __NO_CONVERT +#define __CONVERT_MODE(srcType, dstType) CAT(__CONVERT_MODE_, CAT(REF(T1(srcType)), CAT(_, REF(T1(dstType))))) + +#define __ROUND_MODE__NO_CONVERT +#define __ROUND_MODE__CONVERT // nothing +#define __ROUND_MODE__CONVERT_rte _rte +#define __ROUND_MODE__CONVERT_sat _sat +#define __ROUND_MODE__CONVERT_sat_rte _sat_rte +#define ROUND_MODE(srcType, dstType) CAT(__ROUND_MODE_, __CONVERT_MODE(srcType, dstType)) + +#define __CONVERT_ROUND(dstType, roundMode) CAT(CAT(convert_, REF(dstType)), roundMode) +#define __NO_CONVERT(dstType) // nothing +#define __CONVERT(dstType) __CONVERT_ROUND(dstType,) +#define __CONVERT_rte(dstType) __CONVERT_ROUND(dstType,_rte) +#define __CONVERT_sat(dstType) __CONVERT_ROUND(dstType,_sat) +#define __CONVERT_sat_rte(dstType) __CONVERT_ROUND(dstType,_sat_rte) +#define CONVERT(srcType, dstType) REF(__CONVERT_MODE(srcType,dstType))(dstType) +#define CONVERT_TO(dstType) __CONVERT_ROUND(dstType,) + +// OpenCV depths +#define CV_8U 0 +#define CV_8S 1 +#define CV_16U 2 +#define CV_16S 3 +#define CV_32S 4 +#define CV_32F 5 +#define CV_64F 6 + +// +// End of common preprocessors macro +// + + + +#if defined(DEFINE_feed) + +#define workType TYPE(weight_T1, src_CN) +#define convertSrcToWorkType CONVERT_TO(workType) +#define convertToDstType CONVERT_TO(dst_T) // sat_rte provides incompatible results with CPU path + +__kernel void feed( + DECLARE_MAT_ARG(src), DECLARE_MAT_ARG(weight), + DECLARE_MAT_ARG(dst), DECLARE_MAT_ARG(dstWeight) +) +{ + const int x = get_global_id(0); + const int y = get_global_id(1); + + if (x < srcWidth && y < srcHeight) + { + int src_byteOffset = MAT_BYTE_OFFSET(src, x, y); + int weight_byteOffset = MAT_BYTE_OFFSET(weight, x, y); + 
int dst_byteOffset = MAT_BYTE_OFFSET(dst, x, y); + int dstWeight_byteOffset = MAT_BYTE_OFFSET(dstWeight, x, y); + + weight_T w = LOAD_MAT_AT(weight, weight_byteOffset); + workType src_value = convertSrcToWorkType(LOAD_MAT_AT(src, src_byteOffset)); + STORE_MAT_AT(dst, dst_byteOffset, LOAD_MAT_AT(dst, dst_byteOffset) + convertToDstType(src_value * w)); + STORE_MAT_AT(dstWeight, dstWeight_byteOffset, LOAD_MAT_AT(dstWeight, dstWeight_byteOffset) + w); + } +} + +#endif + +#if defined(DEFINE_normalizeUsingWeightMap) + +#define workType TYPE(weight_T1, mat_CN) +#define convertSrcToWorkType CONVERT_TO(workType) +#define convertToDstType CONVERT_TO(mat_T) // sat_rte provides incompatible results with CPU path + +#if weight_DEPTH >= CV_32F +#define WEIGHT_EPS 1e-5f +#else +#define WEIGHT_EPS 0 +#endif + +__kernel void normalizeUsingWeightMap( + DECLARE_MAT_ARG(mat), DECLARE_MAT_ARG(weight) +) +{ + const int x = get_global_id(0); + const int y = get_global_id(1); + + if (x < matWidth && y < matHeight) + { + int mat_byteOffset = MAT_BYTE_OFFSET(mat, x, y); + int weight_byteOffset = MAT_BYTE_OFFSET(weight, x, y); + + weight_T w = LOAD_MAT_AT(weight, weight_byteOffset); + workType value = convertSrcToWorkType(LOAD_MAT_AT(mat, mat_byteOffset)); + value = value / (w + WEIGHT_EPS); + STORE_MAT_AT(mat, mat_byteOffset, convertToDstType(value)); + } +} + +#endif diff --git a/modules/stitching/src/precomp.hpp b/modules/stitching/src/precomp.hpp index 499202fa0..18ce41309 100644 --- a/modules/stitching/src/precomp.hpp +++ b/modules/stitching/src/precomp.hpp @@ -51,6 +51,7 @@ #include #include #include +#include #include #include "opencv2/core.hpp" #include "opencv2/core/ocl.hpp" diff --git a/modules/stitching/src/seam_finders.cpp b/modules/stitching/src/seam_finders.cpp index 2e5117e5b..4d5c8d163 100644 --- a/modules/stitching/src/seam_finders.cpp +++ b/modules/stitching/src/seam_finders.cpp @@ -46,8 +46,8 @@ namespace cv { namespace detail { -void PairwiseSeamFinder::find(const 
std::vector &src, const std::vector &corners, - std::vector &masks) +void PairwiseSeamFinder::find(const std::vector &src, const std::vector &corners, + std::vector &masks) { LOGLN("Finding seams..."); if (src.size() == 0) @@ -84,7 +84,7 @@ void PairwiseSeamFinder::run() void VoronoiSeamFinder::find(const std::vector &sizes, const std::vector &corners, - std::vector &masks) + std::vector &masks) { LOGLN("Finding seams..."); if (sizes.size() == 0) @@ -110,7 +110,7 @@ void VoronoiSeamFinder::findInPair(size_t first, size_t second, Rect roi) Mat submask2(roi.height + 2 * gap, roi.width + 2 * gap, CV_8U); Size img1 = sizes_[first], img2 = sizes_[second]; - Mat mask1 = masks_[first], mask2 = masks_[second]; + Mat mask1 = masks_[first].getMat(ACCESS_READ), mask2 = masks_[second].getMat(ACCESS_READ); Point tl1 = corners_[first], tl2 = corners_[second]; // Cut submasks with some gap @@ -160,7 +160,7 @@ void VoronoiSeamFinder::findInPair(size_t first, size_t second, Rect roi) DpSeamFinder::DpSeamFinder(CostFunction costFunc) : costFunc_(costFunc) {} -void DpSeamFinder::find(const std::vector &src, const std::vector &corners, std::vector &masks) +void DpSeamFinder::find(const std::vector &src, const std::vector &corners, std::vector &masks) { LOGLN("Finding seams..."); #if ENABLE_LOG @@ -176,13 +176,18 @@ void DpSeamFinder::find(const std::vector &src, const std::vector &c for (size_t j = i+1; j < src.size(); ++j) pairs.push_back(std::make_pair(i, j)); - sort(pairs.begin(), pairs.end(), ImagePairLess(src, corners)); + { + std::vector _src(src.size()); + for (size_t i = 0; i < src.size(); ++i) _src[i] = src[i].getMat(ACCESS_READ); + sort(pairs.begin(), pairs.end(), ImagePairLess(_src, corners)); + } std::reverse(pairs.begin(), pairs.end()); for (size_t i = 0; i < pairs.size(); ++i) { size_t i0 = pairs[i].first, i1 = pairs[i].second; - process(src[i0], src[i1], corners[i0], corners[i1], masks[i0], masks[i1]); + Mat mask0 = masks[i0].getMat(ACCESS_RW), mask1 = 
masks[i1].getMat(ACCESS_RW); + process(src[i0].getMat(ACCESS_READ), src[i1].getMat(ACCESS_READ), corners[i0], corners[i1], mask0, mask1); } LOGLN("Finding seams, time: " << ((getTickCount() - t) / getTickFrequency()) << " sec"); @@ -1055,7 +1060,7 @@ public: ~Impl() {} - void find(const std::vector &src, const std::vector &corners, std::vector &masks); + void find(const std::vector &src, const std::vector &corners, std::vector &masks); void findInPair(size_t first, size_t second, Rect roi); private: @@ -1072,8 +1077,8 @@ private: }; -void GraphCutSeamFinder::Impl::find(const std::vector &src, const std::vector &corners, - std::vector &masks) +void GraphCutSeamFinder::Impl::find(const std::vector &src, const std::vector &corners, + std::vector &masks) { // Compute gradients dx_.resize(src.size()); @@ -1207,10 +1212,10 @@ void GraphCutSeamFinder::Impl::setGraphWeightsColorGrad( void GraphCutSeamFinder::Impl::findInPair(size_t first, size_t second, Rect roi) { - Mat img1 = images_[first], img2 = images_[second]; + Mat img1 = images_[first].getMat(ACCESS_READ), img2 = images_[second].getMat(ACCESS_READ); Mat dx1 = dx_[first], dx2 = dx_[second]; Mat dy1 = dy_[first], dy2 = dy_[second]; - Mat mask1 = masks_[first], mask2 = masks_[second]; + Mat mask1 = masks_[first].getMat(ACCESS_RW), mask2 = masks_[second].getMat(ACCESS_RW); Point tl1 = corners_[first], tl2 = corners_[second]; const int gap = 10; @@ -1309,16 +1314,16 @@ GraphCutSeamFinder::GraphCutSeamFinder(int cost_type, float terminal_cost, float GraphCutSeamFinder::~GraphCutSeamFinder() {} -void GraphCutSeamFinder::find(const std::vector &src, const std::vector &corners, - std::vector &masks) +void GraphCutSeamFinder::find(const std::vector &src, const std::vector &corners, + std::vector &masks) { impl_->find(src, corners, masks); } #ifdef HAVE_OPENCV_CUDA -void GraphCutSeamFinderGpu::find(const std::vector &src, const std::vector &corners, - std::vector &masks) +void GraphCutSeamFinderGpu::find(const std::vector 
&src, const std::vector &corners, + std::vector &masks) { // Compute gradients dx_.resize(src.size()); @@ -1350,10 +1355,10 @@ void GraphCutSeamFinderGpu::find(const std::vector &src, const std::vector< void GraphCutSeamFinderGpu::findInPair(size_t first, size_t second, Rect roi) { - Mat img1 = images_[first], img2 = images_[second]; + Mat img1 = images_[first].getMat(ACCESS_READ), img2 = images_[second].getMat(ACCESS_READ); Mat dx1 = dx_[first], dx2 = dx_[second]; Mat dy1 = dy_[first], dy2 = dy_[second]; - Mat mask1 = masks_[first], mask2 = masks_[second]; + Mat mask1 = masks_[first].getMat(ACCESS_READ), mask2 = masks_[second].getMat(ACCESS_READ); Point tl1 = corners_[first], tl2 = corners_[second]; const int gap = 10; diff --git a/modules/stitching/src/stitcher.cpp b/modules/stitching/src/stitcher.cpp index 5683ec308..ddfdb5084 100644 --- a/modules/stitching/src/stitcher.cpp +++ b/modules/stitching/src/stitcher.cpp @@ -86,15 +86,15 @@ Stitcher Stitcher::createDefault(bool try_use_gpu) } -Stitcher::Status Stitcher::estimateTransform(InputArray images) +Stitcher::Status Stitcher::estimateTransform(InputArrayOfArrays images) { return estimateTransform(images, std::vector >()); } -Stitcher::Status Stitcher::estimateTransform(InputArray images, const std::vector > &rois) +Stitcher::Status Stitcher::estimateTransform(InputArrayOfArrays images, const std::vector > &rois) { - images.getMatVector(imgs_); + images.getUMatVector(imgs_); rois_ = rois; Status status; @@ -112,21 +112,21 @@ Stitcher::Status Stitcher::estimateTransform(InputArray images, const std::vecto Stitcher::Status Stitcher::composePanorama(OutputArray pano) { - return composePanorama(std::vector(), pano); + return composePanorama(std::vector(), pano); } -Stitcher::Status Stitcher::composePanorama(InputArray images, OutputArray pano) +Stitcher::Status Stitcher::composePanorama(InputArrayOfArrays images, OutputArray pano) { LOGLN("Warping images (auxiliary)... 
"); - std::vector imgs; - images.getMatVector(imgs); + std::vector imgs; + images.getUMatVector(imgs); if (!imgs.empty()) { CV_Assert(imgs.size() == imgs_.size()); - Mat img; + UMat img; seam_est_imgs_.resize(imgs.size()); for (size_t i = 0; i < imgs.size(); ++i) @@ -136,8 +136,8 @@ Stitcher::Status Stitcher::composePanorama(InputArray images, OutputArray pano) seam_est_imgs_[i] = img.clone(); } - std::vector seam_est_imgs_subset; - std::vector imgs_subset; + std::vector seam_est_imgs_subset; + std::vector imgs_subset; for (size_t i = 0; i < indices_.size(); ++i) { @@ -149,17 +149,17 @@ Stitcher::Status Stitcher::composePanorama(InputArray images, OutputArray pano) imgs_ = imgs_subset; } - Mat &pano_ = pano.getMatRef(); + UMat pano_; #if ENABLE_LOG int64 t = getTickCount(); #endif std::vector corners(imgs_.size()); - std::vector masks_warped(imgs_.size()); - std::vector images_warped(imgs_.size()); + std::vector masks_warped(imgs_.size()); + std::vector images_warped(imgs_.size()); std::vector sizes(imgs_.size()); - std::vector masks(imgs_.size()); + std::vector masks(imgs_.size()); // Prepare image masks for (size_t i = 0; i < imgs_.size(); ++i) @@ -179,13 +179,13 @@ Stitcher::Status Stitcher::composePanorama(InputArray images, OutputArray pano) K(1,1) *= (float)seam_work_aspect_; K(1,2) *= (float)seam_work_aspect_; - corners[i] = w->warp(seam_est_imgs_[i], K, cameras_[i].R, INTER_LINEAR, BORDER_REFLECT, images_warped[i]); + corners[i] = w->warp(seam_est_imgs_[i], K, cameras_[i].R, INTER_LINEAR, BORDER_CONSTANT, images_warped[i]); sizes[i] = images_warped[i].size(); w->warp(masks[i], K, cameras_[i].R, INTER_NEAREST, BORDER_CONSTANT, masks_warped[i]); } - std::vector images_warped_f(imgs_.size()); + std::vector images_warped_f(imgs_.size()); for (size_t i = 0; i < imgs_.size(); ++i) images_warped[i].convertTo(images_warped_f[i], CV_32F); @@ -206,8 +206,8 @@ Stitcher::Status Stitcher::composePanorama(InputArray images, OutputArray pano) t = getTickCount(); #endif - 
Mat img_warped, img_warped_s; - Mat dilated_mask, seam_mask, mask, mask_warped; + UMat img_warped, img_warped_s; + UMat dilated_mask, seam_mask, mask, mask_warped; //double compose_seam_aspect = 1; double compose_work_aspect = 1; @@ -216,10 +216,13 @@ Stitcher::Status Stitcher::composePanorama(InputArray images, OutputArray pano) double compose_scale = 1; bool is_compose_scale_set = false; - Mat full_img, img; + UMat full_img, img; for (size_t img_idx = 0; img_idx < imgs_.size(); ++img_idx) { LOGLN("Compositing image #" << indices_[img_idx] + 1); +#if ENABLE_LOG + int64 compositing_t = getTickCount(); +#endif // Read image and resize it if necessary full_img = imgs_[img_idx]; @@ -261,25 +264,48 @@ Stitcher::Status Stitcher::composePanorama(InputArray images, OutputArray pano) } } if (std::abs(compose_scale - 1) > 1e-1) + { +#if ENABLE_LOG + int64 resize_t = getTickCount(); +#endif resize(full_img, img, Size(), compose_scale, compose_scale); + LOGLN(" resize time: " << ((getTickCount() - resize_t) / getTickFrequency()) << " sec"); + } else img = full_img; full_img.release(); Size img_size = img.size(); + LOGLN(" after resize time: " << ((getTickCount() - compositing_t) / getTickFrequency()) << " sec"); + Mat K; cameras_[img_idx].K().convertTo(K, CV_32F); +#if ENABLE_LOG + int64 pt = getTickCount(); +#endif // Warp the current image - w->warp(img, K, cameras_[img_idx].R, INTER_LINEAR, BORDER_REFLECT, img_warped); + w->warp(img, K, cameras_[img_idx].R, INTER_LINEAR, BORDER_CONSTANT, img_warped); + LOGLN(" warp the current image: " << ((getTickCount() - pt) / getTickFrequency()) << " sec"); +#if ENABLE_LOG + pt = getTickCount(); +#endif // Warp the current image mask mask.create(img_size, CV_8U); mask.setTo(Scalar::all(255)); w->warp(mask, K, cameras_[img_idx].R, INTER_NEAREST, BORDER_CONSTANT, mask_warped); + LOGLN(" warp the current image mask: " << ((getTickCount() - pt) / getTickFrequency()) << " sec"); +#if ENABLE_LOG + pt = getTickCount(); +#endif // Compensate 
exposure exposure_comp_->apply((int)img_idx, corners[img_idx], img_warped, mask_warped); + LOGLN(" compensate exposure: " << ((getTickCount() - pt) / getTickFrequency()) << " sec"); +#if ENABLE_LOG + pt = getTickCount(); +#endif img_warped.convertTo(img_warped_s, CV_16S); img_warped.release(); @@ -290,7 +316,12 @@ Stitcher::Status Stitcher::composePanorama(InputArray images, OutputArray pano) dilate(masks_warped[img_idx], dilated_mask, Mat()); resize(dilated_mask, seam_mask, mask_warped.size()); - mask_warped = seam_mask & mask_warped; + bitwise_and(seam_mask, mask_warped, mask_warped); + + LOGLN(" other: " << ((getTickCount() - pt) / getTickFrequency()) << " sec"); +#if ENABLE_LOG + pt = getTickCount(); +#endif if (!is_blender_prepared) { @@ -298,24 +329,36 @@ Stitcher::Status Stitcher::composePanorama(InputArray images, OutputArray pano) is_blender_prepared = true; } + LOGLN(" other2: " << ((getTickCount() - pt) / getTickFrequency()) << " sec"); + + LOGLN(" feed..."); +#if ENABLE_LOG + int64 feed_t = getTickCount(); +#endif // Blend the current image blender_->feed(img_warped_s, mask_warped, corners[img_idx]); + LOGLN(" feed time: " << ((getTickCount() - feed_t) / getTickFrequency()) << " sec"); + LOGLN("Compositing ## time: " << ((getTickCount() - compositing_t) / getTickFrequency()) << " sec"); } - Mat result, result_mask; +#if ENABLE_LOG + int64 blend_t = getTickCount(); +#endif + UMat result, result_mask; blender_->blend(result, result_mask); + LOGLN("blend time: " << ((getTickCount() - blend_t) / getTickFrequency()) << " sec"); LOGLN("Compositing, time: " << ((getTickCount() - t) / getTickFrequency()) << " sec"); // Preliminary result is in CV_16SC3 format, but all values are in [0,255] range, // so convert it to avoid user confusing - result.convertTo(pano_, CV_8U); + result.convertTo(pano, CV_8U); return OK; } -Stitcher::Status Stitcher::stitch(InputArray images, OutputArray pano) +Stitcher::Status Stitcher::stitch(InputArrayOfArrays images, OutputArray 
pano) { Status status = estimateTransform(images); if (status != OK) @@ -324,7 +367,7 @@ Stitcher::Status Stitcher::stitch(InputArray images, OutputArray pano) } -Stitcher::Status Stitcher::stitch(InputArray images, const std::vector > &rois, OutputArray pano) +Stitcher::Status Stitcher::stitch(InputArrayOfArrays images, const std::vector > &rois, OutputArray pano) { Status status = estimateTransform(images, rois); if (status != OK) @@ -346,7 +389,7 @@ Stitcher::Status Stitcher::matchImages() seam_scale_ = 1; bool is_work_scale_set = false; bool is_seam_scale_set = false; - Mat full_img, img; + UMat full_img, img; features_.resize(imgs_.size()); seam_est_imgs_.resize(imgs_.size()); full_img_sizes_.resize(imgs_.size()); @@ -420,8 +463,8 @@ Stitcher::Status Stitcher::matchImages() // Leave only images we are sure are from the same panorama indices_ = detail::leaveBiggestComponent(features_, pairwise_matches_, (float)conf_thresh_); - std::vector seam_est_imgs_subset; - std::vector imgs_subset; + std::vector seam_est_imgs_subset; + std::vector imgs_subset; std::vector full_img_sizes_subset; for (size_t i = 0; i < indices_.size(); ++i) { @@ -454,7 +497,7 @@ Stitcher::Status Stitcher::estimateCameraParams() Mat R; cameras_[i].R.convertTo(R, CV_32F); cameras_[i].R = R; - LOGLN("Initial intrinsic parameters #" << indices_[i] + 1 << ":\n " << cameras_[i].K()); + //LOGLN("Initial intrinsic parameters #" << indices_[i] + 1 << ":\n " << cameras_[i].K()); } bundle_adjuster_->setConfThresh(conf_thresh_); @@ -465,7 +508,7 @@ Stitcher::Status Stitcher::estimateCameraParams() std::vector focals; for (size_t i = 0; i < cameras_.size(); ++i) { - LOGLN("Camera #" << indices_[i] + 1 << ":\n" << cameras_[i].K()); + //LOGLN("Camera #" << indices_[i] + 1 << ":\n" << cameras_[i].K()); focals.push_back(cameras_[i].focal); } diff --git a/modules/stitching/src/util.cpp b/modules/stitching/src/util.cpp index f6abf9e48..5e026f098 100644 --- a/modules/stitching/src/util.cpp +++ 
b/modules/stitching/src/util.cpp @@ -113,7 +113,7 @@ bool overlapRoi(Point tl1, Point tl2, Size sz1, Size sz2, Rect &roi) } -Rect resultRoi(const std::vector &corners, const std::vector &images) +Rect resultRoi(const std::vector &corners, const std::vector &images) { std::vector sizes(images.size()); for (size_t i = 0; i < images.size(); ++i) diff --git a/modules/stitching/src/warpers.cpp b/modules/stitching/src/warpers.cpp index eb15d44c0..b6d1f8a8a 100644 --- a/modules/stitching/src/warpers.cpp +++ b/modules/stitching/src/warpers.cpp @@ -41,6 +41,7 @@ //M*/ #include "precomp.hpp" +#include "opencl_kernels.hpp" namespace cv { namespace detail { @@ -86,7 +87,6 @@ Point2f PlaneWarper::warpPoint(const Point2f &pt, InputArray K, InputArray R, In return uv; } - Rect PlaneWarper::buildMaps(Size src_size, InputArray K, InputArray R, InputArray T, OutputArray _xmap, OutputArray _ymap) { projector_.setCameraParams(K, R, T); @@ -94,8 +94,29 @@ Rect PlaneWarper::buildMaps(Size src_size, InputArray K, InputArray R, InputArra Point dst_tl, dst_br; detectResultRoi(src_size, dst_tl, dst_br); - _xmap.create(dst_br.y - dst_tl.y + 1, dst_br.x - dst_tl.x + 1, CV_32F); - _ymap.create(dst_br.y - dst_tl.y + 1, dst_br.x - dst_tl.x + 1, CV_32F); + Size dsize(dst_br.x - dst_tl.x + 1, dst_br.y - dst_tl.y + 1); + _xmap.create(dsize, CV_32FC1); + _ymap.create(dsize, CV_32FC1); + + if (ocl::useOpenCL()) + { + ocl::Kernel k("buildWarpPlaneMaps", ocl::stitching::warpers_oclsrc); + if (!k.empty()) + { + + Mat k_rinv(1, 9, CV_32FC1, projector_.k_rinv), t(1, 3, CV_32FC1, projector_.t); + UMat uxmap = _xmap.getUMat(), uymap = _ymap.getUMat(), + uk_rinv = k_rinv.getUMat(ACCESS_READ), ut = t.getUMat(ACCESS_READ); + + k.args(ocl::KernelArg::WriteOnlyNoSize(uxmap), ocl::KernelArg::WriteOnly(uymap), + ocl::KernelArg::PtrReadOnly(uk_rinv), ocl::KernelArg::PtrReadOnly(ut), + dst_tl.x, dst_tl.y, projector_.scale); + + size_t globalsize[2] = { dsize.width, dsize.height }; + if (k.run(2, globalsize, NULL, 
true)) + return Rect(dst_tl, dst_br); + } + } Mat xmap = _xmap.getMat(), ymap = _ymap.getMat(); @@ -117,11 +138,11 @@ Rect PlaneWarper::buildMaps(Size src_size, InputArray K, InputArray R, InputArra Point PlaneWarper::warp(InputArray src, InputArray K, InputArray R, InputArray T, int interp_mode, int border_mode, OutputArray dst) { - Mat xmap, ymap; - Rect dst_roi = buildMaps(src.size(), K, R, T, xmap, ymap); + UMat uxmap, uymap; + Rect dst_roi = buildMaps(src.size(), K, R, T, uxmap, uymap); dst.create(dst_roi.height + 1, dst_roi.width + 1, src.type()); - remap(src, dst, xmap, ymap, interp_mode, border_mode); + remap(src, dst, uxmap, uymap, interp_mode, border_mode); return dst_roi.tl(); } @@ -341,5 +362,93 @@ void SphericalPortraitWarper::detectResultRoi(Size src_size, Point &dst_tl, Poin dst_br.y = static_cast(br_vf); } +/////////////////////////////////////////// SphericalWarper //////////////////////////////////////// + +Rect SphericalWarper::buildMaps(Size src_size, InputArray K, InputArray R, OutputArray xmap, OutputArray ymap) +{ + if (ocl::useOpenCL()) + { + ocl::Kernel k("buildWarpSphericalMaps", ocl::stitching::warpers_oclsrc); + if (!k.empty()) + { + projector_.setCameraParams(K, R); + + Point dst_tl, dst_br; + detectResultRoi(src_size, dst_tl, dst_br); + + Size dsize(dst_br.x - dst_tl.x + 1, dst_br.y - dst_tl.y + 1); + xmap.create(dsize, CV_32FC1); + ymap.create(dsize, CV_32FC1); + + Mat k_rinv(1, 9, CV_32FC1, projector_.k_rinv); + UMat uxmap = xmap.getUMat(), uymap = ymap.getUMat(), uk_rinv = k_rinv.getUMat(ACCESS_READ); + + k.args(ocl::KernelArg::WriteOnlyNoSize(uxmap), ocl::KernelArg::WriteOnly(uymap), + ocl::KernelArg::PtrReadOnly(uk_rinv), dst_tl.x, dst_tl.y, projector_.scale); + + size_t globalsize[2] = { dsize.width, dsize.height }; + if (k.run(2, globalsize, NULL, true)) + return Rect(dst_tl, dst_br); + } + } + + return RotationWarperBase::buildMaps(src_size, K, R, xmap, ymap); +} + +Point SphericalWarper::warp(InputArray src, InputArray K, 
InputArray R, int interp_mode, int border_mode, OutputArray dst) +{ + UMat uxmap, uymap; + Rect dst_roi = buildMaps(src.size(), K, R, uxmap, uymap); + + dst.create(dst_roi.height + 1, dst_roi.width + 1, src.type()); + remap(src, dst, uxmap, uymap, interp_mode, border_mode); + + return dst_roi.tl(); +} + +/////////////////////////////////////////// CylindricalWarper //////////////////////////////////////// + +Rect CylindricalWarper::buildMaps(Size src_size, InputArray K, InputArray R, OutputArray xmap, OutputArray ymap) +{ + if (ocl::useOpenCL()) + { + ocl::Kernel k("buildWarpCylindricalMaps", ocl::stitching::warpers_oclsrc); + if (!k.empty()) + { + projector_.setCameraParams(K, R); + + Point dst_tl, dst_br; + detectResultRoi(src_size, dst_tl, dst_br); + + Size dsize(dst_br.x - dst_tl.x + 1, dst_br.y - dst_tl.y + 1); + xmap.create(dsize, CV_32FC1); + ymap.create(dsize, CV_32FC1); + + Mat k_rinv(1, 9, CV_32FC1, projector_.k_rinv); + UMat uxmap = xmap.getUMat(), uymap = ymap.getUMat(), uk_rinv = k_rinv.getUMat(ACCESS_READ); + + k.args(ocl::KernelArg::WriteOnlyNoSize(uxmap), ocl::KernelArg::WriteOnly(uymap), + ocl::KernelArg::PtrReadOnly(uk_rinv), dst_tl.x, dst_tl.y, projector_.scale); + + size_t globalsize[2] = { dsize.width, dsize.height }; + if (k.run(2, globalsize, NULL, true)) + return Rect(dst_tl, dst_br); + } + } + + return RotationWarperBase::buildMaps(src_size, K, R, xmap, ymap); +} + +Point CylindricalWarper::warp(InputArray src, InputArray K, InputArray R, int interp_mode, int border_mode, OutputArray dst) +{ + UMat uxmap, uymap; + Rect dst_roi = buildMaps(src.size(), K, R, uxmap, uymap); + + dst.create(dst_roi.height + 1, dst_roi.width + 1, src.type()); + remap(src, dst, uxmap, uymap, interp_mode, border_mode); + + return dst_roi.tl(); +} + } // namespace detail } // namespace cv diff --git a/modules/stitching/src/warpers_ocl.cpp b/modules/stitching/src/warpers_ocl.cpp deleted file mode 100644 index ef8f31677..000000000 --- 
a/modules/stitching/src/warpers_ocl.cpp +++ /dev/null @@ -1,187 +0,0 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// -// -// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. -// -// By downloading, copying, installing or using the software you agree to this license. -// If you do not agree to this license, do not download, install, -// copy or use the software. -// -// -// License Agreement -// For Open Source Computer Vision Library -// -// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. -// Copyright (C) 2009, Willow Garage Inc., all rights reserved. -// Third party copyrights are property of their respective owners. -// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// * Redistribution's of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// * The name of the copyright holders may not be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors "as is" and -// any express or implied warranties, including, but not limited to, the implied -// warranties of merchantability and fitness for a particular purpose are disclaimed. 
-// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. -// -//M*/ - -#include "precomp.hpp" -#include "opencl_kernels.hpp" - -namespace cv { -namespace detail { - -/////////////////////////////////////////// PlaneWarperOcl //////////////////////////////////////////// - -Rect PlaneWarperOcl::buildMaps(Size src_size, InputArray K, InputArray R, InputArray T, OutputArray xmap, OutputArray ymap) -{ - projector_.setCameraParams(K, R, T); - - Point dst_tl, dst_br; - detectResultRoi(src_size, dst_tl, dst_br); - - if (ocl::useOpenCL()) - { - ocl::Kernel k("buildWarpPlaneMaps", ocl::stitching::warpers_oclsrc); - if (!k.empty()) - { - Size dsize(dst_br.x - dst_tl.x + 1, dst_br.y - dst_tl.y + 1); - xmap.create(dsize, CV_32FC1); - ymap.create(dsize, CV_32FC1); - - Mat k_rinv(1, 9, CV_32FC1, projector_.k_rinv), t(1, 3, CV_32FC1, projector_.t); - UMat uxmap = xmap.getUMat(), uymap = ymap.getUMat(), - uk_rinv = k_rinv.getUMat(ACCESS_READ), ut = t.getUMat(ACCESS_READ); - - k.args(ocl::KernelArg::WriteOnlyNoSize(uxmap), ocl::KernelArg::WriteOnly(uymap), - ocl::KernelArg::PtrReadOnly(uk_rinv), ocl::KernelArg::PtrReadOnly(ut), - dst_tl.x, dst_tl.y, projector_.scale); - - size_t globalsize[2] = { dsize.width, dsize.height }; - if (k.run(2, globalsize, NULL, true)) - return Rect(dst_tl, dst_br); - } - } - - return PlaneWarper::buildMaps(src_size, K, R, T, xmap, ymap); -} - -Point PlaneWarperOcl::warp(InputArray src, InputArray K, InputArray R, InputArray T, int interp_mode, int border_mode, OutputArray 
dst) -{ - UMat uxmap, uymap; - Rect dst_roi = buildMaps(src.size(), K, R, T, uxmap, uymap); - - dst.create(dst_roi.height + 1, dst_roi.width + 1, src.type()); - UMat udst = dst.getUMat(); - remap(src, udst, uxmap, uymap, interp_mode, border_mode); - - return dst_roi.tl(); -} - -/////////////////////////////////////////// SphericalWarperOcl //////////////////////////////////////// - -Rect SphericalWarperOcl::buildMaps(Size src_size, InputArray K, InputArray R, OutputArray xmap, OutputArray ymap) -{ - projector_.setCameraParams(K, R); - - Point dst_tl, dst_br; - detectResultRoi(src_size, dst_tl, dst_br); - - if (ocl::useOpenCL()) - { - ocl::Kernel k("buildWarpSphericalMaps", ocl::stitching::warpers_oclsrc); - if (!k.empty()) - { - Size dsize(dst_br.x - dst_tl.x + 1, dst_br.y - dst_tl.y + 1); - xmap.create(dsize, CV_32FC1); - ymap.create(dsize, CV_32FC1); - - Mat k_rinv(1, 9, CV_32FC1, projector_.k_rinv); - UMat uxmap = xmap.getUMat(), uymap = ymap.getUMat(), uk_rinv = k_rinv.getUMat(ACCESS_READ); - - k.args(ocl::KernelArg::WriteOnlyNoSize(uxmap), ocl::KernelArg::WriteOnly(uymap), - ocl::KernelArg::PtrReadOnly(uk_rinv), dst_tl.x, dst_tl.y, projector_.scale); - - size_t globalsize[2] = { dsize.width, dsize.height }; - if (k.run(2, globalsize, NULL, true)) - return Rect(dst_tl, dst_br); - } - } - - return SphericalWarper::buildMaps(src_size, K, R, xmap, ymap); -} - -Point SphericalWarperOcl::warp(InputArray src, InputArray K, InputArray R, int interp_mode, int border_mode, OutputArray dst) -{ - UMat uxmap, uymap; - Rect dst_roi = buildMaps(src.size(), K, R, uxmap, uymap); - - dst.create(dst_roi.height + 1, dst_roi.width + 1, src.type()); - UMat udst = dst.getUMat(); - remap(src, udst, uxmap, uymap, interp_mode, border_mode); - - return dst_roi.tl(); -} - -/////////////////////////////////////////// CylindricalWarperOcl //////////////////////////////////////// - -Rect CylindricalWarperOcl::buildMaps(Size src_size, InputArray K, InputArray R, OutputArray xmap, OutputArray 
ymap) -{ - projector_.setCameraParams(K, R); - - Point dst_tl, dst_br; - detectResultRoi(src_size, dst_tl, dst_br); - - if (ocl::useOpenCL()) - { - ocl::Kernel k("buildWarpCylindricalMaps", ocl::stitching::warpers_oclsrc); - if (!k.empty()) - { - Size dsize(dst_br.x - dst_tl.x + 1, dst_br.y - dst_tl.y + 1); - xmap.create(dsize, CV_32FC1); - ymap.create(dsize, CV_32FC1); - - Mat k_rinv(1, 9, CV_32FC1, projector_.k_rinv); - UMat uxmap = xmap.getUMat(), uymap = ymap.getUMat(), uk_rinv = k_rinv.getUMat(ACCESS_READ); - - k.args(ocl::KernelArg::WriteOnlyNoSize(uxmap), ocl::KernelArg::WriteOnly(uymap), - ocl::KernelArg::PtrReadOnly(uk_rinv), dst_tl.x, dst_tl.y, projector_.scale); - - size_t globalsize[2] = { dsize.width, dsize.height }; - if (k.run(2, globalsize, NULL, true)) - return Rect(dst_tl, dst_br); - } - } - - return CylindricalWarper::buildMaps(src_size, K, R, xmap, ymap); -} - -Point CylindricalWarperOcl::warp(InputArray src, InputArray K, InputArray R, int interp_mode, int border_mode, OutputArray dst) -{ - UMat uxmap, uymap; - Rect dst_roi = buildMaps(src.size(), K, R, uxmap, uymap); - - dst.create(dst_roi.height + 1, dst_roi.width + 1, src.type()); - UMat udst = dst.getUMat(); - remap(src, udst, uxmap, uymap, interp_mode, border_mode); - - return dst_roi.tl(); -} - -} // namespace detail -} // namespace cv diff --git a/modules/stitching/test/ocl/test_warpers.cpp b/modules/stitching/test/ocl/test_warpers.cpp index 94050e966..43f0e9741 100644 --- a/modules/stitching/test/ocl/test_warpers.cpp +++ b/modules/stitching/test/ocl/test_warpers.cpp @@ -48,13 +48,11 @@ namespace cvtest { namespace ocl { -///////////////////////// WarperTestBase /////////////////////////// - struct WarperTestBase : public Test, public TestUtils { Mat src, dst, xmap, ymap; - Mat udst, uxmap, uymap; + UMat usrc, udst, uxmap, uymap; Mat K, R; virtual void generateTestData() @@ -62,6 +60,7 @@ struct WarperTestBase : Size size = randomSize(1, MAX_VALUE); src = randomMat(size, CV_32FC1, -500, 
500); + src.copyTo(usrc); K = Mat::eye(3, 3, CV_32FC1); float angle = (float)(30.0 * CV_PI / 180.0); @@ -81,70 +80,64 @@ struct WarperTestBase : } }; -//////////////////////////////// SphericalWarperOcl ///////////////////////////////////////////////// +typedef WarperTestBase SphericalWarperTest; -typedef WarperTestBase SphericalWarperOclTest; - -OCL_TEST_F(SphericalWarperOclTest, Mat) +OCL_TEST_F(SphericalWarperTest, Mat) { for (int j = 0; j < test_loop_times; j++) { generateTestData(); - Ptr creator = makePtr(); + Ptr creator = makePtr(); Ptr warper = creator->create(2.0); OCL_OFF(warper->buildMaps(src.size(), K, R, xmap, ymap)); - OCL_ON(warper->buildMaps(src.size(), K, R, uxmap, uymap)); + OCL_ON(warper->buildMaps(usrc.size(), K, R, uxmap, uymap)); OCL_OFF(warper->warp(src, K, R, INTER_LINEAR, BORDER_REPLICATE, dst)); - OCL_ON(warper->warp(src, K, R, INTER_LINEAR, BORDER_REPLICATE, udst)); + OCL_ON(warper->warp(usrc, K, R, INTER_LINEAR, BORDER_REPLICATE, udst)); Near(1e-4); } } -//////////////////////////////// CylindricalWarperOcl ///////////////////////////////////////////////// +typedef WarperTestBase CylindricalWarperTest; -typedef WarperTestBase CylindricalWarperOclTest; - -OCL_TEST_F(CylindricalWarperOclTest, Mat) +OCL_TEST_F(CylindricalWarperTest, Mat) { for (int j = 0; j < test_loop_times; j++) { generateTestData(); - Ptr creator = makePtr(); + Ptr creator = makePtr(); Ptr warper = creator->create(2.0); OCL_OFF(warper->buildMaps(src.size(), K, R, xmap, ymap)); - OCL_ON(warper->buildMaps(src.size(), K, R, uxmap, uymap)); + OCL_ON(warper->buildMaps(usrc.size(), K, R, uxmap, uymap)); OCL_OFF(warper->warp(src, K, R, INTER_LINEAR, BORDER_REPLICATE, dst)); - OCL_ON(warper->warp(src, K, R, INTER_LINEAR, BORDER_REPLICATE, udst)); + OCL_ON(warper->warp(usrc, K, R, INTER_LINEAR, BORDER_REPLICATE, udst)); Near(1e-4); } } -//////////////////////////////// PlaneWarperOcl ///////////////////////////////////////////////// +typedef WarperTestBase PlaneWarperTest; 
-typedef WarperTestBase PlaneWarperOclTest; - -OCL_TEST_F(PlaneWarperOclTest, Mat) +OCL_TEST_F(PlaneWarperTest, Mat) { for (int j = 0; j < test_loop_times; j++) { generateTestData(); - Ptr creator = makePtr(); + Ptr creator = makePtr(); Ptr warper = creator->create(2.0); OCL_OFF(warper->buildMaps(src.size(), K, R, xmap, ymap)); - OCL_ON(warper->buildMaps(src.size(), K, R, uxmap, uymap)); + OCL_ON(warper->buildMaps(usrc.size(), K, R, uxmap, uymap)); OCL_OFF(warper->warp(src, K, R, INTER_LINEAR, BORDER_REPLICATE, dst)); - OCL_ON(warper->warp(src, K, R, INTER_LINEAR, BORDER_REPLICATE, udst)); + OCL_ON(warper->warp(usrc, K, R, INTER_LINEAR, BORDER_REPLICATE, udst)); Near(1e-4); } diff --git a/modules/stitching/test/test_blenders.cpp b/modules/stitching/test/test_blenders.cpp index cb84482f2..f7c7835de 100644 --- a/modules/stitching/test/test_blenders.cpp +++ b/modules/stitching/test/test_blenders.cpp @@ -73,6 +73,6 @@ TEST(MultiBandBlender, CanBlendTwoImages) Mat result; result_s.convertTo(result, CV_8U); Mat expected = imread(string(cvtest::TS::ptr()->get_data_path()) + "stitching/baboon_lena.png"); - double rmsErr = cvtest::norm(expected, result, NORM_L2) / sqrt(double(expected.size().area())); - ASSERT_LT(rmsErr, 1e-3); + double psnr = cvtest::PSNR(expected, result); + EXPECT_GE(psnr, 50); } diff --git a/modules/ts/include/opencv2/ts/ocl_test.hpp b/modules/ts/include/opencv2/ts/ocl_test.hpp index 5dd25dba4..43c01b2da 100644 --- a/modules/ts/include/opencv2/ts/ocl_test.hpp +++ b/modules/ts/include/opencv2/ts/ocl_test.hpp @@ -57,6 +57,31 @@ namespace ocl { using namespace cv; using namespace testing; +inline std::vector ToUMat(const std::vector& src) +{ + std::vector dst; + dst.resize(src.size()); + for (size_t i = 0; i < src.size(); ++i) + { + src[i].copyTo(dst[i]); + } + return dst; +} + +inline UMat ToUMat(const Mat& src) +{ + UMat dst; + src.copyTo(dst); + return dst; +} + +inline UMat ToUMat(InputArray src) +{ + UMat dst; + src.getMat().copyTo(dst); + return dst; 
+} + extern int test_loop_times; #define MAX_VALUE 357 diff --git a/modules/video/doc/motion_analysis_and_object_tracking.rst b/modules/video/doc/motion_analysis_and_object_tracking.rst index b392c6854..7d5d1d5be 100644 --- a/modules/video/doc/motion_analysis_and_object_tracking.rst +++ b/modules/video/doc/motion_analysis_and_object_tracking.rst @@ -596,7 +596,7 @@ Returns the number of gaussian components in the background model BackgroundSubtractorMOG2::setNMixtures -------------------------------------- -Sets the number of gaussian components in the background model +Sets the number of gaussian components in the background model. The model needs to be reinitialized to reserve memory. .. ocv:function:: void BackgroundSubtractorMOG2::setNMixtures(int nmixtures) @@ -615,9 +615,23 @@ Sets the "background ratio" parameter of the algorithm .. ocv:function:: void BackgroundSubtractorMOG2::setBackgroundRatio(double ratio) +BackgroundSubtractorMOG2::getVarThreshold +--------------------------------------------- +Returns the variance threshold for the pixel-model match + +.. ocv:function:: double BackgroundSubtractorMOG2::getVarThreshold() const + +The main threshold on the squared Mahalanobis distance to decide if the sample is well described by the background model or not. Related to Cthr from the paper. + +BackgroundSubtractorMOG2::setVarThreshold +--------------------------------------------- +Sets the variance threshold for the pixel-model match + +.. ocv:function:: void BackgroundSubtractorMOG2::setVarThreshold(double varThreshold) + BackgroundSubtractorMOG2::getVarThresholdGen --------------------------------------------- -Returns the variance scale factor for the pixel-model match +Returns the variance threshold for the pixel-model match used for new mixture component generation ..
ocv:function:: double BackgroundSubtractorMOG2::getVarThresholdGen() const @@ -625,7 +639,7 @@ Threshold for the squared Mahalanobis distance that helps decide when a sample i BackgroundSubtractorMOG2::setVarThresholdGen --------------------------------------------- -Sets the variance scale factor for the pixel-model match +Sets the variance threshold for the pixel-model match used for new mixture component generation .. ocv:function:: void BackgroundSubtractorMOG2::setVarThresholdGen(double varThresholdGen) @@ -700,6 +714,126 @@ Sets the shadow threshold .. ocv:function:: void BackgroundSubtractorMOG2::setShadowThreshold(double threshold) +BackgroundSubtractorKNN +------------------------ +K-nearest neighbours - based Background/Foreground Segmentation Algorithm. + +.. ocv:class:: BackgroundSubtractorKNN : public BackgroundSubtractor + +The class implements the K-nearest neighbours background subtraction described in [Zivkovic2006]_ . Very efficient if number of foreground pixels is low. + + +createBackgroundSubtractorKNN +-------------------------------------------------- +Creates KNN Background Subtractor + +.. ocv:function:: Ptr createBackgroundSubtractorKNN( int history=500, double dist2Threshold=400.0, bool detectShadows=true ) + + :param history: Length of the history. + + :param dist2Threshold: Threshold on the squared distance between the pixel and the sample to decide whether a pixel is close to that sample. This parameter does not affect the background update. + + :param detectShadows: If true, the algorithm will detect shadows and mark them. It decreases the speed a bit, so if you do not need this feature, set the parameter to false. + + +BackgroundSubtractorKNN::getHistory +-------------------------------------- +Returns the number of last frames that affect the background model + +..
ocv:function:: int BackgroundSubtractorKNN::getHistory() const + + +BackgroundSubtractorKNN::setHistory +-------------------------------------- +Sets the number of last frames that affect the background model + +.. ocv:function:: void BackgroundSubtractorKNN::setHistory(int history) + + +BackgroundSubtractorKNN::getNSamples +-------------------------------------- +Returns the number of data samples in the background model + +.. ocv:function:: int BackgroundSubtractorKNN::getNSamples() const + + +BackgroundSubtractorKNN::setNSamples +-------------------------------------- +Sets the number of data samples in the background model. The model needs to be reinitialized to reserve memory. + +.. ocv:function:: void BackgroundSubtractorKNN::setNSamples(int _nN) + + +BackgroundSubtractorKNN::getDist2Threshold +--------------------------------------------- +Returns the threshold on the squared distance between the pixel and the sample + +.. ocv:function:: double BackgroundSubtractorKNN::getDist2Threshold() const + +The threshold on the squared distance between the pixel and the sample to decide whether a pixel is close to a data sample. + +BackgroundSubtractorKNN::setDist2Threshold +--------------------------------------------- +Sets the threshold on the squared distance + +.. ocv:function:: void BackgroundSubtractorKNN::setDist2Threshold(double _dist2Threshold) + +BackgroundSubtractorKNN::getkNNSamples +--------------------------------------------- +Returns the number of neighbours, the k in the kNN. K is the number of samples that need to be within dist2Threshold in order to decide that that pixel is matching the kNN background model. + +.. ocv:function:: int BackgroundSubtractorKNN::getkNNSamples() const + +BackgroundSubtractorKNN::setkNNSamples +--------------------------------------------- +Sets the k in the kNN. How many nearest neighbours need to match. + +..
ocv:function:: void BackgroundSubtractorKNN::setkNNSamples(int _nkNN) + + +BackgroundSubtractorKNN::getDetectShadows +--------------------------------------------- +Returns the shadow detection flag + +.. ocv:function:: bool BackgroundSubtractorKNN::getDetectShadows() const + +If true, the algorithm detects shadows and marks them. See createBackgroundSubtractorKNN for details. + +BackgroundSubtractorKNN::setDetectShadows +--------------------------------------------- +Enables or disables shadow detection + +.. ocv:function:: void BackgroundSubtractorKNN::setDetectShadows(bool detectShadows) + +BackgroundSubtractorKNN::getShadowValue +--------------------------------------------- +Returns the shadow value + +.. ocv:function:: int BackgroundSubtractorKNN::getShadowValue() const + +Shadow value is the value used to mark shadows in the foreground mask. Default value is 127. Value 0 in the mask always means background, 255 means foreground. + +BackgroundSubtractorKNN::setShadowValue +--------------------------------------------- +Sets the shadow value + +.. ocv:function:: void BackgroundSubtractorKNN::setShadowValue(int value) + +BackgroundSubtractorKNN::getShadowThreshold +--------------------------------------------- +Returns the shadow threshold + +.. ocv:function:: double BackgroundSubtractorKNN::getShadowThreshold() const + +A shadow is detected if pixel is a darker version of the background. The shadow threshold (``Tau`` in the paper) is a threshold defining how much darker the shadow can be. ``Tau= 0.5`` means that if a pixel is more than twice darker then it is not shadow. See Prati, Mikic, Trivedi and Cucchiarra, *Detecting Moving Shadows...*, IEEE PAMI,2003. + +BackgroundSubtractorKNN::setShadowThreshold +--------------------------------------------- +Sets the shadow threshold + +.. 
ocv:function:: void BackgroundSubtractorKNN::setShadowThreshold(double threshold) + + BackgroundSubtractorGMG ------------------------ Background Subtractor module based on the algorithm given in [Gold2012]_. @@ -974,9 +1108,9 @@ Releases all inner buffers. .. [Bradski98] Bradski, G.R. "Computer Vision Face Tracking for Use in a Perceptual User Interface", Intel, 1998 -.. [Bradski00] Davis, J.W. and Bradski, G.R. “Motion Segmentation and Pose Recognition with Motion History Gradients”, WACV00, 2000 +.. [Bradski00] Davis, J.W. and Bradski, G.R. "Motion Segmentation and Pose Recognition with Motion History Gradients", WACV00, 2000 -.. [Davis97] Davis, J.W. and Bobick, A.F. “The Representation and Recognition of Action Using Temporal Templates”, CVPR97, 1997 +.. [Davis97] Davis, J.W. and Bobick, A.F. "The Representation and Recognition of Action Using Temporal Templates", CVPR97, 1997 .. [EP08] Evangelidis, G.D. and Psarakis E.Z. "Parametric Image Alignment using Enhanced Correlation Coefficient Maximization", IEEE Transactions on PAMI, vol. 32, no. 10, 2008 @@ -990,7 +1124,7 @@ Releases all inner buffers. .. [Lucas81] Lucas, B., and Kanade, T. An Iterative Image Registration Technique with an Application to Stereo Vision, Proc. of 7th International Joint Conference on Artificial Intelligence (IJCAI), pp. 674-679. -.. [Welch95] Greg Welch and Gary Bishop “An Introduction to the Kalman Filter”, 1995 +.. [Welch95] Greg Welch and Gary Bishop "An Introduction to the Kalman Filter", 1995 .. [Tao2012] Michael Tao, Jiamin Bai, Pushmeet Kohli and Sylvain Paris. SimpleFlow: A Non-iterative, Sublinear Optical Flow Algorithm. 
Computer Graphics Forum (Eurographics 2012) diff --git a/modules/video/include/opencv2/video/background_segm.hpp b/modules/video/include/opencv2/video/background_segm.hpp index a46dc820e..73988c806 100644 --- a/modules/video/include/opencv2/video/background_segm.hpp +++ b/modules/video/include/opencv2/video/background_segm.hpp @@ -113,7 +113,7 @@ public: CV_WRAP virtual void setHistory(int history) = 0; CV_WRAP virtual int getNMixtures() const = 0; - CV_WRAP virtual void setNMixtures(int nmixtures) = 0; + CV_WRAP virtual void setNMixtures(int nmixtures) = 0;//needs reinitialization! CV_WRAP virtual double getBackgroundRatio() const = 0; CV_WRAP virtual void setBackgroundRatio(double ratio) = 0; @@ -150,6 +150,45 @@ CV_EXPORTS_W Ptr createBackgroundSubtractorMOG2(int history=500, double varThreshold=16, bool detectShadows=true); +/*! + The class implements the K nearest neighbours algorithm from: + "Efficient Adaptive Density Estimation per Image Pixel for the Task of Background Subtraction" + Z.Zivkovic, F. van der Heijden + Pattern Recognition Letters, vol. 27, no. 7, pages 773-780, 2006 + http://www.zoranz.net/Publications/zivkovicPRL2006.pdf + + Fast for small foreground object. Results on the benchmark data is at http://www.changedetection.net. +*/ + +class CV_EXPORTS_W BackgroundSubtractorKNN : public BackgroundSubtractor +{ +public: + CV_WRAP virtual int getHistory() const = 0; + CV_WRAP virtual void setHistory(int history) = 0; + + CV_WRAP virtual int getNSamples() const = 0; + CV_WRAP virtual void setNSamples(int _nN) = 0;//needs reinitialization!
+ + CV_WRAP virtual double getDist2Threshold() const = 0; + CV_WRAP virtual void setDist2Threshold(double _dist2Threshold) = 0; + + CV_WRAP virtual int getkNNSamples() const = 0; + CV_WRAP virtual void setkNNSamples(int _nkNN) = 0; + + CV_WRAP virtual bool getDetectShadows() const = 0; + CV_WRAP virtual void setDetectShadows(bool detectShadows) = 0; + + CV_WRAP virtual int getShadowValue() const = 0; + CV_WRAP virtual void setShadowValue(int value) = 0; + + CV_WRAP virtual double getShadowThreshold() const = 0; + CV_WRAP virtual void setShadowThreshold(double threshold) = 0; +}; + +CV_EXPORTS_W Ptr + createBackgroundSubtractorKNN(int history=500, double dist2Threshold=400.0, + bool detectShadows=true); + /** * Background Subtractor module. Takes a series of images and returns a sequence of mask (8UC1) * images of the same size, where 255 indicates Foreground and 0 represents Background. diff --git a/modules/video/src/bgfg_KNN.cpp b/modules/video/src/bgfg_KNN.cpp new file mode 100755 index 000000000..63ef300e0 --- /dev/null +++ b/modules/video/src/bgfg_KNN.cpp @@ -0,0 +1,654 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000, Intel Corporation, all rights reserved. +// Copyright (C) 2013, OpenCV Foundation, all rights reserved. +// Third party copyrights are property of their respective owners. 
+// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ +//#include + +#include "precomp.hpp" + +namespace cv +{ + +/*! + The class implements the following algorithm: + "Efficient Adaptive Density Estimation per Image Pixel for the Task of Background Subtraction" + Z.Zivkovic, F. van der Heijden + Pattern Recognition Letters, vol. 27, no. 
7, pages 773-780, 2006 + http://www.zoranz.net/Publications/zivkovicPRL2006.pdf +*/ + +// default parameters of gaussian background detection algorithm +static const int defaultHistory2 = 500; // Learning rate; alpha = 1/defaultHistory2 +static const int defaultNsamples = 7; // number of samples saved in memory +static const float defaultDist2Threshold = 20.0f*20.0f;//threshold on distance from the sample + +// additional parameters +static const unsigned char defaultnShadowDetection2 = (unsigned char)127; // value to use in the segmentation mask for shadows, set 0 not to do shadow detection +static const float defaultfTau = 0.5f; // Tau - shadow threshold, see the paper for explanation + +class BackgroundSubtractorKNNImpl : public BackgroundSubtractorKNN +{ +public: + //! the default constructor + BackgroundSubtractorKNNImpl() + { + frameSize = Size(0,0); + frameType = 0; + nframes = 0; + history = defaultHistory2; + + //set parameters + // N - the number of samples stored in memory per model + nN = defaultNsamples; + + //kNN - k nearest neighbour - number on NN for detecting background - default K=[0.1*nN] + nkNN=MAX(1,cvRound(0.1*nN*3+0.40)); + + //Tb - Threshold Tb*kernelwidth + fTb = defaultDist2Threshold; + + // Shadow detection + bShadowDetection = 1;//turn on + nShadowDetection = defaultnShadowDetection2; + fTau = defaultfTau;// Tau - shadow threshold + name_ = "BackgroundSubtractor.KNN"; + } + //! the full constructor that takes the length of the history, + // the number of gaussian mixtures, the background ratio parameter and the noise strength + BackgroundSubtractorKNNImpl(int _history, float _dist2Threshold, bool _bShadowDetection=true) + { + frameSize = Size(0,0); + frameType = 0; + + nframes = 0; + history = _history > 0 ? 
_history : defaultHistory2; + + //set parameters + // N - the number of samples stored in memory per model + nN = defaultNsamples; + //kNN - k nearest neighbour - number on NN for detecting background - default K=[0.1*nN] + nkNN=MAX(1,cvRound(0.1*nN*3+0.40)); + + //Tb - Threshold Tb*kernelwidth + fTb = _dist2Threshold>0? _dist2Threshold : defaultDist2Threshold; + + bShadowDetection = _bShadowDetection; + nShadowDetection = defaultnShadowDetection2; + fTau = defaultfTau; + name_ = "BackgroundSubtractor.KNN"; + } + //! the destructor + ~BackgroundSubtractorKNNImpl() {} + //! the update operator + void apply(InputArray image, OutputArray fgmask, double learningRate=-1); + + //! computes a background image which are the mean of all background gaussians + virtual void getBackgroundImage(OutputArray backgroundImage) const; + + //! re-initialization method + void initialize(Size _frameSize, int _frameType) + { + frameSize = _frameSize; + frameType = _frameType; + nframes = 0; + + int nchannels = CV_MAT_CN(frameType); + CV_Assert( nchannels <= CV_CN_MAX ); + + // Reserve memory for the model + int size=frameSize.height*frameSize.width; + // for each sample of 3 speed pixel models each pixel bg model we store ... + // values + flag (nchannels+1 values) + bgmodel.create( 1,(nN * 3) * (nchannels+1)* size,CV_8U); + + //index through the three circular lists + aModelIndexShort.create(1,size,CV_8U); + aModelIndexMid.create(1,size,CV_8U); + aModelIndexLong.create(1,size,CV_8U); + //when to update next + nNextShortUpdate.create(1,size,CV_8U); + nNextMidUpdate.create(1,size,CV_8U); + nNextLongUpdate.create(1,size,CV_8U); + + //Reset counters + nShortCounter = 0; + nMidCounter = 0; + nLongCounter = 0; + + aModelIndexShort = Scalar::all(0);//random?
//((m_nN)*rand())/(RAND_MAX+1);//0...m_nN-1 + aModelIndexMid = Scalar::all(0); + aModelIndexLong = Scalar::all(0); + nNextShortUpdate = Scalar::all(0); + nNextMidUpdate = Scalar::all(0); + nNextLongUpdate = Scalar::all(0); + } + + virtual AlgorithmInfo* info() const { return 0; } + + virtual int getHistory() const { return history; } + virtual void setHistory(int _nframes) { history = _nframes; } + + virtual int getNSamples() const { return nN; } + virtual void setNSamples(int _nN) { nN = _nN; }//needs reinitialization! + + virtual int getkNNSamples() const { return nkNN; } + virtual void setkNNSamples(int _nkNN) { nkNN = _nkNN; } + + virtual double getDist2Threshold() const { return fTb; } + virtual void setDist2Threshold(double _dist2Threshold) { fTb = (float)_dist2Threshold; } + + virtual bool getDetectShadows() const { return bShadowDetection; } + virtual void setDetectShadows(bool detectshadows) { bShadowDetection = detectshadows; } + + virtual int getShadowValue() const { return nShadowDetection; } + virtual void setShadowValue(int value) { nShadowDetection = (uchar)value; } + + virtual double getShadowThreshold() const { return fTau; } + virtual void setShadowThreshold(double value) { fTau = (float)value; } + + virtual void write(FileStorage& fs) const + { + fs << "name" << name_ + << "history" << history + << "nsamples" << nN + << "nKNN" << nkNN + << "dist2Threshold" << fTb + << "detectShadows" << (int)bShadowDetection + << "shadowValue" << (int)nShadowDetection + << "shadowThreshold" << fTau; + } + + virtual void read(const FileNode& fn) + { + CV_Assert( (String)fn["name"] == name_ ); + history = (int)fn["history"]; + nN = (int)fn["nsamples"]; + nkNN = (int)fn["nKNN"]; + fTb = (float)fn["dist2Threshold"]; + bShadowDetection = (int)fn["detectShadows"] != 0; + nShadowDetection = saturate_cast((int)fn["shadowValue"]); + fTau = (float)fn["shadowThreshold"]; + } + +protected: + Size frameSize; + int frameType; + int nframes; + ///////////////////////// + //very 
important parameters - things you will change + //////////////////////// + int history; + //alpha=1/history - speed of update - if the time interval you want to average over is T + //set alpha=1/history. It is also useful at start to make T slowly increase + //from 1 until the desired T + float fTb; + //Tb - threshold on the squared distance from the sample used to decide if it is well described + //by the background model or not. A typical value could be 2 sigma + //and that is Tb=2*2*10*10 =400; where we take typical pixel level sigma=10 + + ///////////////////////// + //less important parameters - things you might change but be careful + //////////////////////// + int nN;//total number of samples + int nkNN;//number on NN for detecting background - default K=[0.1*nN] + + //shadow detection parameters + bool bShadowDetection;//default 1 - do shadow detection + unsigned char nShadowDetection;//do shadow detection - insert this value as the detection result - 127 default value + float fTau; + // Tau - shadow threshold. The shadow is detected if the pixel is darker + //version of the background. Tau is a threshold on how much darker the shadow can be. + //Tau= 0.5 means that if pixel is more than 2 times darker then it is not shadow + //See: Prati,Mikic,Trivedi,Cucchiarra,"Detecting Moving Shadows...",IEEE PAMI,2003. + + //model data + int nLongCounter;//circular counter + int nMidCounter; + int nShortCounter; + Mat bgmodel; // model data pixel values + Mat aModelIndexShort;// index into the models + Mat aModelIndexMid; + Mat aModelIndexLong; + Mat nNextShortUpdate;//random update points per model + Mat nNextMidUpdate; + Mat nNextLongUpdate; + + String name_; +}; + +//{ to do - parallelization ... +//struct KNNInvoker....
+CV_INLINE void + _cvUpdatePixelBackgroundNP( long pixel,const uchar* data, int nchannels, int m_nN, + uchar* m_aModel, + uchar* m_nNextLongUpdate, + uchar* m_nNextMidUpdate, + uchar* m_nNextShortUpdate, + uchar* m_aModelIndexLong, + uchar* m_aModelIndexMid, + uchar* m_aModelIndexShort, + int m_nLongCounter, + int m_nMidCounter, + int m_nShortCounter, + int m_nLongUpdate, + int m_nMidUpdate, + int m_nShortUpdate, + uchar include + ) +{ + // hold the offset + int ndata=1+nchannels; + long offsetLong = ndata * (pixel * m_nN * 3 + m_aModelIndexLong[pixel] + m_nN * 2); + long offsetMid = ndata * (pixel * m_nN * 3 + m_aModelIndexMid[pixel] + m_nN * 1); + long offsetShort = ndata * (pixel * m_nN * 3 + m_aModelIndexShort[pixel]); + + // Long update? + if (m_nNextLongUpdate[pixel] == m_nLongCounter) + { + // add the oldest pixel from Mid to the list of values (for each color) + memcpy(&m_aModel[offsetLong],&m_aModel[offsetMid],ndata*sizeof(unsigned char)); + // increase the index + m_aModelIndexLong[pixel] = (m_aModelIndexLong[pixel] >= (m_nN-1)) ? 0 : (m_aModelIndexLong[pixel] + 1); + }; + if (m_nLongCounter == (m_nLongUpdate-1)) + { + //m_nNextLongUpdate[pixel] = (uchar)(((m_nLongUpdate)*(rand()-1))/RAND_MAX);//0,...m_nLongUpdate-1; + m_nNextLongUpdate[pixel] = (uchar)( rand() % m_nLongUpdate );//0,...m_nLongUpdate-1; + }; + + // Mid update? + if (m_nNextMidUpdate[pixel] == m_nMidCounter) + { + // add this pixel to the list of values (for each color) + memcpy(&m_aModel[offsetMid],&m_aModel[offsetShort],ndata*sizeof(unsigned char)); + // increase the index + m_aModelIndexMid[pixel] = (m_aModelIndexMid[pixel] >= (m_nN-1)) ? 0 : (m_aModelIndexMid[pixel] + 1); + }; + if (m_nMidCounter == (m_nMidUpdate-1)) + { + m_nNextMidUpdate[pixel] = (uchar)( rand() % m_nMidUpdate ); + }; + + // Short update? 
+ if (m_nNextShortUpdate[pixel] == m_nShortCounter) + { + // add this pixel to the list of values (for each color) + memcpy(&m_aModel[offsetShort],data,ndata*sizeof(unsigned char)); + //set the include flag + m_aModel[offsetShort+nchannels]=include; + // increase the index + m_aModelIndexShort[pixel] = (m_aModelIndexShort[pixel] >= (m_nN-1)) ? 0 : (m_aModelIndexShort[pixel] + 1); + }; + if (m_nShortCounter == (m_nShortUpdate-1)) + { + m_nNextShortUpdate[pixel] = (uchar)( rand() % m_nShortUpdate ); + }; +}; + +CV_INLINE int + _cvCheckPixelBackgroundNP(long pixel, + const uchar* data, int nchannels, + int m_nN, + uchar* m_aModel, + float m_fTb, + int m_nkNN, + float tau, + int m_nShadowDetection, + uchar& include) +{ + int Pbf = 0; // the total probability that this pixel is background + int Pb = 0; //background model probability + float dData[CV_CN_MAX]; + + //uchar& include=data[nchannels]; + include=0;//do we include this pixel into background model? + + int ndata=nchannels+1; + long posPixel = pixel * ndata * m_nN * 3; +// float k; + // now increase the probability for each pixel + for (int n = 0; n < m_nN*3; n++) + { + uchar* mean_m = &m_aModel[posPixel + n*ndata]; + + //calculate difference and distance + float dist2; + + if( nchannels == 3 ) + { + dData[0] = (float)mean_m[0] - data[0]; + dData[1] = (float)mean_m[1] - data[1]; + dData[2] = (float)mean_m[2] - data[2]; + dist2 = dData[0]*dData[0] + dData[1]*dData[1] + dData[2]*dData[2]; + } + else + { + dist2 = 0.f; + for( int c = 0; c < nchannels; c++ ) + { + dData[c] = (float)mean_m[c] - data[c]; + dist2 += dData[c]*dData[c]; + } + } + + if (dist2= m_nkNN)//Tb + { + include=1;//include + return 1;//background ->exit + }; + } + }; + }; + + //include? 
+ if (Pbf>=m_nkNN)//m_nTbf) + { + include=1; + } + + int Ps = 0; // the total probability that this pixel is background shadow + // Detected as moving object, perform shadow detection + if (m_nShadowDetection) + { + for (int n = 0; n < m_nN*3; n++) + { + //long subPosPixel = posPixel + n*ndata; + uchar* mean_m = &m_aModel[posPixel + n*ndata]; + + if(mean_m[nchannels])//check only background + { + float numerator = 0.0f; + float denominator = 0.0f; + for( int c = 0; c < nchannels; c++ ) + { + numerator += (float)data[c] * mean_m[c]; + denominator += (float)mean_m[c] * mean_m[c]; + } + + // no division by zero allowed + if( denominator == 0 ) + return 0; + + // if tau < a < 1 then also check the color distortion + if( numerator <= denominator && numerator >= tau*denominator ) + { + float a = numerator / denominator; + float dist2a = 0.0f; + + for( int c = 0; c < nchannels; c++ ) + { + float dD= a*mean_m[c] - data[c]; + dist2a += dD*dD; + } + + if (dist2a= m_nkNN)//shadow + return 2; + }; + }; + }; + }; + } + return 0; +}; + +CV_INLINE void + icvUpdatePixelBackgroundNP(const Mat& _src, Mat& _dst, + Mat& _bgmodel, + Mat& _nNextLongUpdate, + Mat& _nNextMidUpdate, + Mat& _nNextShortUpdate, + Mat& _aModelIndexLong, + Mat& _aModelIndexMid, + Mat& _aModelIndexShort, + int& _nLongCounter, + int& _nMidCounter, + int& _nShortCounter, + int _nN, + float _fAlphaT, + float _fTb, + int _nkNN, + float _fTau, + int _bShadowDetection, + uchar nShadowDetection + ) +{ + int size=_src.rows*_src.cols; + int nchannels = CV_MAT_CN(_src.type()); + const uchar* pDataCurrent=_src.ptr(0); + uchar* pDataOutput=_dst.ptr(0); + //model + uchar* m_aModel=_bgmodel.ptr(0); + uchar* m_nNextLongUpdate=_nNextLongUpdate.ptr(0); + uchar* m_nNextMidUpdate=_nNextMidUpdate.ptr(0); + uchar* m_nNextShortUpdate=_nNextShortUpdate.ptr(0); + uchar* m_aModelIndexLong=_aModelIndexLong.ptr(0); + uchar* m_aModelIndexMid=_aModelIndexMid.ptr(0); + uchar* m_aModelIndexShort=_aModelIndexShort.ptr(0); + + //some constants 
+ int m_nN=_nN; + float m_fAlphaT=_fAlphaT; + float m_fTb=_fTb;//Tb - threshold on the distance + float m_fTau=_fTau; + int m_nkNN=_nkNN; + int m_bShadowDetection=_bShadowDetection; + + //recalculate update rates - in case alpha is changed + // calculate update parameters (using alpha) + int Kshort,Kmid,Klong; + //approximate exponential learning curve + Kshort=(int)(log(0.7)/log(1-m_fAlphaT))+1;//Kshort + Kmid=(int)(log(0.4)/log(1-m_fAlphaT))-Kshort+1;//Kmid + Klong=(int)(log(0.1)/log(1-m_fAlphaT))-Kshort-Kmid+1;//Klong + + //refresh rates + int m_nShortUpdate = (Kshort/m_nN)+1; + int m_nMidUpdate = (Kmid/m_nN)+1; + int m_nLongUpdate = (Klong/m_nN)+1; + + //int m_nShortUpdate = MAX((Kshort/m_nN),m_nN); + //int m_nMidUpdate = MAX((Kmid/m_nN),m_nN); + //int m_nLongUpdate = MAX((Klong/m_nN),m_nN); + + //update counters for the refresh rate + int m_nLongCounter=_nLongCounter; + int m_nMidCounter=_nMidCounter; + int m_nShortCounter=_nShortCounter; + + _nShortCounter++;//0,1,...,m_nShortUpdate-1 + _nMidCounter++; + _nLongCounter++; + if (_nShortCounter >= m_nShortUpdate) _nShortCounter = 0; + if (_nMidCounter >= m_nMidUpdate) _nMidCounter = 0; + if (_nLongCounter >= m_nLongUpdate) _nLongCounter = 0; + + //go through the image + for (long i=0;i= 1 || image.size() != frameSize || image.type() != frameType; + + if( needToInitialize ) + initialize(image.size(), image.type()); + + _fgmask.create( image.size(), CV_8U ); + Mat fgmask = _fgmask.getMat(); + + ++nframes; + learningRate = learningRate >= 0 && nframes > 1 ? 
learningRate : 1./std::min( 2*nframes, history ); + CV_Assert(learningRate >= 0); + + //parallel_for_(Range(0, image.rows), + // KNNInvoker(image, fgmask, + icvUpdatePixelBackgroundNP(image, fgmask, + bgmodel, + nNextLongUpdate, + nNextMidUpdate, + nNextShortUpdate, + aModelIndexLong, + aModelIndexMid, + aModelIndexShort, + nLongCounter, + nMidCounter, + nShortCounter, + nN, + (float)learningRate, + fTb, + nkNN, + fTau, + bShadowDetection, + nShadowDetection + ); +} + +void BackgroundSubtractorKNNImpl::getBackgroundImage(OutputArray backgroundImage) const +{ + int nchannels = CV_MAT_CN(frameType); + //CV_Assert( nchannels == 3 ); + Mat meanBackground(frameSize, CV_8UC3, Scalar::all(0)); + + int ndata=nchannels+1; + int modelstep=(ndata * nN * 3); + + const uchar* pbgmodel=bgmodel.ptr(0); + for(int row=0; row(row, col) = Vec3b(mean_m); + break; + } + } + pbgmodel=pbgmodel+modelstep; + } + } + + switch(CV_MAT_CN(frameType)) + { + case 1: + { + std::vector channels; + split(meanBackground, channels); + channels[0].copyTo(backgroundImage); + break; + } + case 3: + { + meanBackground.copyTo(backgroundImage); + break; + } + default: + CV_Error(Error::StsUnsupportedFormat, ""); + } +} + + +Ptr createBackgroundSubtractorKNN(int _history, double _threshold2, + bool _bShadowDetection) +{ + return makePtr(_history, (float)_threshold2, _bShadowDetection); +} + +} + +/* End of file. 
*/ diff --git a/modules/video/src/lkpyramid.cpp b/modules/video/src/lkpyramid.cpp index 5b653c9d1..4f0f313f7 100644 --- a/modules/video/src/lkpyramid.cpp +++ b/modules/video/src/lkpyramid.cpp @@ -890,6 +890,26 @@ namespace cv std::vector prevPyr; prevPyr.resize(maxLevel + 1); std::vector nextPyr; nextPyr.resize(maxLevel + 1); + // allocate buffers with aligned pitch to be able to use cl_khr_image2d_from_buffer extension + // This is the required pitch alignment in pixels + int pitchAlign = (int)ocl::Device::getDefault().imagePitchAlignment(); + if (pitchAlign>0) + { + prevPyr[0] = UMat(prevImg.rows,(prevImg.cols+pitchAlign-1)&(-pitchAlign),prevImg.type()).colRange(0,prevImg.cols); + nextPyr[0] = UMat(nextImg.rows,(nextImg.cols+pitchAlign-1)&(-pitchAlign),nextImg.type()).colRange(0,nextImg.cols); + for (int level = 1; level <= maxLevel; ++level) + { + int cols,rows; + // allocate buffers with aligned pitch to be able to use image on buffer extension + cols = (prevPyr[level - 1].cols+1)/2; + rows = (prevPyr[level - 1].rows+1)/2; + prevPyr[level] = UMat(rows,(cols+pitchAlign-1)&(-pitchAlign),prevPyr[level-1].type()).colRange(0,cols); + cols = (nextPyr[level - 1].cols+1)/2; + rows = (nextPyr[level - 1].rows+1)/2; + nextPyr[level] = UMat(rows,(cols+pitchAlign-1)&(-pitchAlign),nextPyr[level-1].type()).colRange(0,cols); + } + } + prevImg.convertTo(prevPyr[0], CV_32F); nextImg.convertTo(nextPyr[0], CV_32F); @@ -969,8 +989,10 @@ namespace cv if (!kernel.create("lkSparse", cv::ocl::video::pyrlk_oclsrc, build_options)) return false; - ocl::Image2D imageI(I); - ocl::Image2D imageJ(J); + CV_Assert(I.depth() == CV_32F && J.depth() == CV_32F); + ocl::Image2D imageI(I, false, ocl::Image2D::canCreateAlias(I)); + ocl::Image2D imageJ(J, false, ocl::Image2D::canCreateAlias(J)); + int idxArg = 0; idxArg = kernel.set(idxArg, imageI); //image2d_t I idxArg = kernel.set(idxArg, imageJ); //image2d_t J @@ -1070,7 +1092,9 @@ void cv::calcOpticalFlowPyrLK( InputArray _prevImg, InputArray 
_nextImg, TermCriteria criteria, int flags, double minEigThreshold ) { - bool use_opencl = ocl::useOpenCL() && (_prevImg.isUMat() || _nextImg.isUMat()); + bool use_opencl = ocl::useOpenCL() && + (_prevImg.isUMat() || _nextImg.isUMat()) && + ocl::Image2D::isFormatSupported(CV_32F, 1, false); if ( use_opencl && ocl_calcOpticalFlowPyrLK(_prevImg, _nextImg, _prevPts, _nextPts, _status, _err, winSize, maxLevel, criteria, flags/*, minEigThreshold*/)) return; diff --git a/modules/video/src/opencl/pyrlk.cl b/modules/video/src/opencl/pyrlk.cl index cf401057d..84889b448 100644 --- a/modules/video/src/opencl/pyrlk.cl +++ b/modules/video/src/opencl/pyrlk.cl @@ -48,6 +48,9 @@ #define GRIDSIZE 3 #define LSx 8 #define LSy 8 +// define local memory sizes +#define LM_W (LSx*GRIDSIZE+2) +#define LM_H (LSy*GRIDSIZE+2) #define BUFFER (LSx*LSy) #define BUFFER2 BUFFER>>1 #ifndef WAVE_SIZE @@ -224,8 +227,9 @@ inline void reduce1(float val1, __local volatile float* smem1, int tid) __constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_LINEAR; // macro to get pixel value from local memory -#define VAL(_y,_x,_yy,_xx) (IPatchLocal[yid+((_y)*LSy)+1+(_yy)][xid+((_x)*LSx)+1+(_xx)]) -inline void SetPatch(local float IPatchLocal[LSy*GRIDSIZE+2][LSx*GRIDSIZE+2], int TileY, int TileX, + +#define VAL(_y,_x,_yy,_xx) (IPatchLocal[(yid+((_y)*LSy)+1+(_yy))*LM_W+(xid+((_x)*LSx)+1+(_xx))]) +inline void SetPatch(local float* IPatchLocal, int TileY, int TileX, float* Pch, float* Dx, float* Dy, float* A11, float* A12, float* A22, float w) { @@ -266,8 +270,8 @@ inline void GetError(image2d_t J, const float x, const float y, const float* Pch //macro to read pixel value into local memory. 
-#define READI(_y,_x) IPatchLocal[yid+((_y)*LSy)][xid+((_x)*LSx)] = read_imagef(I, sampler, (float2)(Point.x + xid+(_x)*LSx + 0.5f-1, Point.y + yid+(_y)*LSy+ 0.5f-1)).x; -void ReadPatchIToLocalMem(image2d_t I, float2 Point, local float IPatchLocal[LSy*GRIDSIZE+2][LSx*GRIDSIZE+2]) +#define READI(_y,_x) IPatchLocal[(yid+((_y)*LSy))*LM_W+(xid+((_x)*LSx))] = read_imagef(I, sampler, (float2)(Point.x + xid+(_x)*LSx + 0.5f-1, Point.y + yid+(_y)*LSy+ 0.5f-1)).x; +void ReadPatchIToLocalMem(image2d_t I, float2 Point, local float* IPatchLocal) { unsigned int xid=get_local_id(0); unsigned int yid=get_local_id(1); @@ -341,7 +345,7 @@ __kernel void lkSparse(image2d_t I, image2d_t J, float dIdy_patch[GRIDSIZE][GRIDSIZE]; // local memory to read image with border to calc sobels - local float IPatchLocal[LSy*GRIDSIZE+2][LSx*GRIDSIZE+2]; + local float IPatchLocal[LM_W*LM_H]; ReadPatchIToLocalMem(I,prevPt,IPatchLocal); { diff --git a/modules/video/src/tvl1flow.cpp b/modules/video/src/tvl1flow.cpp index fad73ef65..914f09c73 100644 --- a/modules/video/src/tvl1flow.cpp +++ b/modules/video/src/tvl1flow.cpp @@ -352,7 +352,9 @@ OpticalFlowDual_TVL1::OpticalFlowDual_TVL1() void OpticalFlowDual_TVL1::calc(InputArray _I0, InputArray _I1, InputOutputArray _flow) { - CV_OCL_RUN(_flow.isUMat(), calc_ocl(_I0, _I1, _flow)) + CV_OCL_RUN(_flow.isUMat() && + ocl::Image2D::isFormatSupported(CV_32F, 1, false), + calc_ocl(_I0, _I1, _flow)) Mat I0 = _I0.getMat(); Mat I1 = _I1.getMat(); diff --git a/modules/viz/doc/widget.rst b/modules/viz/doc/widget.rst index 906adf9ba..8c5f0c0cf 100644 --- a/modules/viz/doc/widget.rst +++ b/modules/viz/doc/widget.rst @@ -898,6 +898,10 @@ This 3D Widget defines a point cloud. :: WCloud(InputArray cloud, InputArray colors); //! All points in cloud have the same color WCloud(InputArray cloud, const Color &color = Color::white()); + //! 
Each point in cloud is mapped to a color in colors, normals are used for shading + WCloud(InputArray cloud, InputArray colors, InputArray normals); + //! All points in cloud have the same color, normals are used for shading + WCloud(InputArray cloud, const Color &color, InputArray normals); }; viz::WCloud::WCloud @@ -918,6 +922,22 @@ Constructs a WCloud. Points in the cloud belong to mask when they are set to (NaN, NaN, NaN). +.. ocv:function:: WCloud(InputArray cloud, InputArray colors, InputArray normals) + + :param cloud: Set of points which can be of type: ``CV_32FC3``, ``CV_32FC4``, ``CV_64FC3``, ``CV_64FC4``. + :param colors: Set of colors. It has to be of the same size with cloud. + :param normals: Normals for each point in cloud. Size and type should match with the cloud parameter. + + Points in the cloud belong to mask when they are set to (NaN, NaN, NaN). + +.. ocv:function:: WCloud(InputArray cloud, const Color &color, InputArray normals) + + :param cloud: Set of points which can be of type: ``CV_32FC3``, ``CV_32FC4``, ``CV_64FC3``, ``CV_64FC4``. + :param color: A single :ocv:class:`Color` for the whole cloud. + :param normals: Normals for each point in cloud. Size and type should match with the cloud parameter. + + Points in the cloud belong to mask when they are set to (NaN, NaN, NaN). + .. note:: In case there are four channels in the cloud, fourth channel is ignored. viz::WCloudCollection diff --git a/modules/viz/include/opencv2/viz/widgets.hpp b/modules/viz/include/opencv2/viz/widgets.hpp index 2949598c5..611db5449 100644 --- a/modules/viz/include/opencv2/viz/widgets.hpp +++ b/modules/viz/include/opencv2/viz/widgets.hpp @@ -320,8 +320,15 @@ namespace cv public: //! Each point in cloud is mapped to a color in colors WCloud(InputArray cloud, InputArray colors); + //! All points in cloud have the same color WCloud(InputArray cloud, const Color &color = Color::white()); + + //! 
Each point in cloud is mapped to a color in colors, normals are used for shading + WCloud(InputArray cloud, InputArray colors, InputArray normals); + + //! All points in cloud have the same color, normals are used for shading + WCloud(InputArray cloud, const Color &color, InputArray normals); }; class CV_EXPORTS WPaintedCloud: public Widget3D diff --git a/modules/viz/src/clouds.cpp b/modules/viz/src/clouds.cpp index eec02639e..48d057d2a 100644 --- a/modules/viz/src/clouds.cpp +++ b/modules/viz/src/clouds.cpp @@ -49,11 +49,29 @@ /// Point Cloud Widget implementation cv::viz::WCloud::WCloud(InputArray cloud, InputArray colors) +{ + WCloud cloud_widget(cloud, colors, cv::noArray()); + *this = cloud_widget; +} + +cv::viz::WCloud::WCloud(InputArray cloud, const Color &color) +{ + WCloud cloud_widget(cloud, Mat(cloud.size(), CV_8UC3, color)); + *this = cloud_widget; +} + +cv::viz::WCloud::WCloud(InputArray cloud, const Color &color, InputArray normals) +{ + WCloud cloud_widget(cloud, Mat(cloud.size(), CV_8UC3, color), normals); + *this = cloud_widget; +} + +cv::viz::WCloud::WCloud(cv::InputArray cloud, cv::InputArray colors, cv::InputArray normals) { CV_Assert(!cloud.empty() && !colors.empty()); vtkSmartPointer cloud_source = vtkSmartPointer::New(); - cloud_source->SetColorCloud(cloud, colors); + cloud_source->SetColorCloudNormals(cloud, colors, normals); cloud_source->Update(); vtkSmartPointer mapper = vtkSmartPointer::New(); @@ -69,12 +87,7 @@ cv::viz::WCloud::WCloud(InputArray cloud, InputArray colors) actor->SetMapper(mapper); WidgetAccessor::setProp(*this, actor); -} -cv::viz::WCloud::WCloud(InputArray cloud, const Color &color) -{ - WCloud cloud_widget(cloud, Mat(cloud.size(), CV_8UC3, color)); - *this = cloud_widget; } diff --git a/modules/viz/test/tests_simple.cpp b/modules/viz/test/tests_simple.cpp index 8c944bb22..4b01387fa 100644 --- a/modules/viz/test/tests_simple.cpp +++ b/modules/viz/test/tests_simple.cpp @@ -248,6 +248,22 @@ TEST(Viz, show_sampled_normals) 
viz.spin(); } +TEST(Viz, show_cloud_shaded_by_normals) +{ + Mesh mesh = Mesh::load(get_dragon_ply_file_path()); + computeNormals(mesh, mesh.normals); + + Affine3d pose = Affine3d().rotate(Vec3d(0, 0.8, 0)); + + WCloud cloud(mesh.cloud, Color::white(), mesh.normals); + cloud.setRenderingProperty(SHADING, SHADING_GOURAUD); + + Viz3d viz("show_cloud_shaded_by_normals"); + viz.showWidget("cloud", cloud, pose); + viz.showWidget("text2d", WText("Cloud shaded by normals", Point(20, 20), 20, Color::green())); + viz.spin(); +} + TEST(Viz, show_trajectories) { std::vector path = generate_test_trajectory(), sub0, sub1, sub2, sub3, sub4, sub5; diff --git a/samples/cpp/stitching_detailed.cpp b/samples/cpp/stitching_detailed.cpp index 5eb3df46c..df0a9abc8 100644 --- a/samples/cpp/stitching_detailed.cpp +++ b/samples/cpp/stitching_detailed.cpp @@ -74,9 +74,6 @@ static void printUsage() " --try_cuda (yes|no)\n" " Try to use CUDA. The default value is 'no'. All default values\n" " are for CPU mode.\n" - " --try_ocl (yes|no)\n" - " Try to use OpenCL. The default value is 'no'. All default values\n" - " are for CPU mode.\n" "\nMotion Estimation Flags:\n" " --work_megapix \n" " Resolution for image registration step. 
The default is 0.6 Mpx.\n" @@ -127,7 +124,6 @@ static void printUsage() vector img_names; bool preview = false; bool try_cuda = false; -bool try_ocl = false; double work_megapix = 0.6; double seam_megapix = 0.1; double compose_megapix = -1; @@ -178,19 +174,6 @@ static int parseCmdArgs(int argc, char** argv) } i++; } - else if (string(argv[i]) == "--try_ocl") - { - if (string(argv[i + 1]) == "no") - try_ocl = false; - else if (string(argv[i + 1]) == "yes") - try_ocl = true; - else - { - cout << "Bad --try_ocl flag value\n"; - return -1; - } - i++; - } else if (string(argv[i]) == "--work_megapix") { work_megapix = atof(argv[i + 1]); @@ -348,7 +331,9 @@ int main(int argc, char* argv[]) int64 app_start_time = getTickCount(); #endif +#if 0 cv::setBreakOnError(true); +#endif int retval = parseCmdArgs(argc, argv); if (retval) @@ -554,10 +539,10 @@ int main(int argc, char* argv[]) #endif vector corners(num_images); - vector masks_warped(num_images); - vector images_warped(num_images); + vector masks_warped(num_images); + vector images_warped(num_images); vector sizes(num_images); - vector masks(num_images); + vector masks(num_images); // Preapre images masks for (int i = 0; i < num_images; ++i) @@ -569,17 +554,8 @@ int main(int argc, char* argv[]) // Warp images and their masks Ptr warper_creator; - if (try_ocl) - { - if (warp_type == "plane") - warper_creator = makePtr(); - else if (warp_type == "cylindrical") - warper_creator = makePtr(); - else if (warp_type == "spherical") - warper_creator = makePtr(); - } #ifdef HAVE_OPENCV_CUDAWARPING - else if (try_cuda && cuda::getCudaEnabledDeviceCount() > 0) + if (try_cuda && cuda::getCudaEnabledDeviceCount() > 0) { if (warp_type == "plane") warper_creator = makePtr(); @@ -645,7 +621,7 @@ int main(int argc, char* argv[]) warper->warp(masks[i], K, cameras[i].R, INTER_NEAREST, BORDER_CONSTANT, masks_warped[i]); } - vector images_warped_f(num_images); + vector images_warped_f(num_images); for (int i = 0; i < num_images; ++i) 
images_warped[i].convertTo(images_warped_f[i], CV_32F);