diff --git a/CMakeLists.txt b/CMakeLists.txt index 4fff2d614..2fb1cf7f5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -156,6 +156,7 @@ OCV_OPTION(WITH_OPENCL "Include OpenCL Runtime support" ON OCV_OPTION(WITH_OPENCLAMDFFT "Include AMD OpenCL FFT library support" ON IF (NOT ANDROID AND NOT IOS) ) OCV_OPTION(WITH_OPENCLAMDBLAS "Include AMD OpenCL BLAS library support" ON IF (NOT ANDROID AND NOT IOS) ) OCV_OPTION(WITH_DIRECTX "Include DirectX support" ON IF WIN32 ) +OCV_OPTION(WITH_INTELPERC "Include Intel Perceptual Computing support" OFF IF WIN32 ) # OpenCV build components @@ -207,7 +208,8 @@ OCV_OPTION(ENABLE_SSSE3 "Enable SSSE3 instructions" OCV_OPTION(ENABLE_SSE41 "Enable SSE4.1 instructions" OFF IF ((CV_ICC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) ) OCV_OPTION(ENABLE_SSE42 "Enable SSE4.2 instructions" OFF IF (CMAKE_COMPILER_IS_GNUCXX AND (X86 OR X86_64)) ) OCV_OPTION(ENABLE_AVX "Enable AVX instructions" OFF IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) ) -OCV_OPTION(ENABLE_NEON "Enable NEON instructions" OFF IF (CMAKE_COMPILER_IS_GNUCXX AND ARM) ) +OCV_OPTION(ENABLE_NEON "Enable NEON instructions" OFF IF CMAKE_COMPILER_IS_GNUCXX AND ARM ) +OCV_OPTION(ENABLE_VFPV3 "Enable VFPv3-D32 instructions" OFF IF CMAKE_COMPILER_IS_GNUCXX AND ARM ) OCV_OPTION(ENABLE_NOISY_WARNINGS "Show all warnings even if they are too noisy" OFF ) OCV_OPTION(OPENCV_WARNINGS_ARE_ERRORS "Treat warnings as errors" OFF ) OCV_OPTION(ENABLE_WINRT_MODE "Build with Windows Runtime support" OFF IF WIN32 ) @@ -226,6 +228,15 @@ include(cmake/OpenCVVersion.cmake) # Save libs and executables in the same place set(EXECUTABLE_OUTPUT_PATH "${CMAKE_BINARY_DIR}/bin" CACHE PATH "Output directory for applications" ) +if (ANDROID) + if (ANDROID_ABI MATCHES "NEON") + set(ENABLE_NEON ON) + endif() + if (ANDROID_ABI MATCHES "VFPV3") + set(ENABLE_VFPV3 ON) + endif() +endif() + if(ANDROID OR WIN32) set(OPENCV_DOC_INSTALL_PATH doc) elseif(INSTALL_TO_MANGLED_PATHS) @@ -820,6 +831,11 @@ if(DEFINED WITH_XINE) status(" Xine:" HAVE_XINE THEN "YES (ver ${ALIASOF_libxine_VERSION})" ELSE NO) endif(DEFINED WITH_XINE) +if(DEFINED WITH_INTELPERC) + status(" Intel PerC:" HAVE_INTELPERC THEN "YES" ELSE NO) +endif(DEFINED WITH_INTELPERC) + + # ========================== Other third-party libraries ========================== status("") status(" Other third-party libraries:") diff --git a/cmake/OpenCVCompilerOptions.cmake b/cmake/OpenCVCompilerOptions.cmake index fd36a45c6..59b19b601 100644 --- a/cmake/OpenCVCompilerOptions.cmake +++ b/cmake/OpenCVCompilerOptions.cmake @@ -124,6 +124,12 @@ if(CMAKE_COMPILER_IS_GNUCXX) if(ENABLE_SSE2) add_extra_compiler_option(-msse2) endif() + if (ENABLE_NEON) + add_extra_compiler_option("-mfpu=neon") + endif() + if (ENABLE_VFPV3 AND NOT ENABLE_NEON) + add_extra_compiler_option("-mfpu=vfpv3") + endif() # SSE3 and further should be disabled under MingW because it generates compiler errors if(NOT MINGW) diff --git a/cmake/OpenCVFindIntelPerCSDK.cmake b/cmake/OpenCVFindIntelPerCSDK.cmake new file mode 100644 index 000000000..724310560 --- /dev/null +++ b/cmake/OpenCVFindIntelPerCSDK.cmake @@ -0,0 +1,20 @@ +# Main variables: +# INTELPERC_LIBRARIES and INTELPERC_INCLUDE to link Intel Perceptial Computing SDK modules +# HAVE_INTELPERC for conditional compilation OpenCV with/without Intel Perceptial Computing SDK + +if(X86_64) + find_path(INTELPERC_INCLUDE_DIR "pxcsession.h" PATHS "$ENV{PCSDK_DIR}include" DOC "Path to Intel Perceptual Computing SDK interface headers") + find_file(INTELPERC_LIBRARIES 
"libpxc.lib" PATHS "$ENV{PCSDK_DIR}lib/x64" DOC "Path to Intel Perceptual Computing SDK interface libraries") +else() + find_path(INTELPERC_INCLUDE_DIR "pxcsession.h" PATHS "$ENV{PCSDK_DIR}include" DOC "Path to Intel Perceptual Computing SDK interface headers") + find_file(INTELPERC_LIBRARIES "libpxc.lib" PATHS "$ENV{PCSDK_DIR}lib/Win32" DOC "Path to Intel Perceptual Computing SDK interface libraries") +endif() + +if(INTELPERC_INCLUDE_DIR AND INTELPERC_LIBRARIES) + set(HAVE_INTELPERC TRUE) +else() + set(HAVE_INTELPERC FALSE) + message(WARNING "Intel Perceptual Computing SDK library directory (set by INTELPERC_LIB_DIR variable) is not found or does not have Intel Perceptual Computing SDK libraries.") +endif() #if(INTELPERC_INCLUDE_DIR AND INTELPERC_LIBRARIES) + +mark_as_advanced(FORCE INTELPERC_LIBRARIES INTELPERC_INCLUDE_DIR) \ No newline at end of file diff --git a/cmake/OpenCVFindLibsVideo.cmake b/cmake/OpenCVFindLibsVideo.cmake index 807f4fbbf..93cce2b7a 100644 --- a/cmake/OpenCVFindLibsVideo.cmake +++ b/cmake/OpenCVFindLibsVideo.cmake @@ -277,3 +277,8 @@ if (NOT IOS) set(HAVE_QTKIT YES) endif() endif() + +# --- Intel Perceptual Computing SDK --- +if(WITH_INTELPERC) + include("${OpenCV_SOURCE_DIR}/cmake/OpenCVFindIntelPerCSDK.cmake") +endif(WITH_INTELPERC) diff --git a/cmake/templates/cvconfig.h.in b/cmake/templates/cvconfig.h.in index 554b91cef..e1beaada7 100644 --- a/cmake/templates/cvconfig.h.in +++ b/cmake/templates/cvconfig.h.in @@ -88,6 +88,9 @@ /* Define to 1 if you have the header file. */ #cmakedefine HAVE_INTTYPES_H 1 +/* Intel Perceptual Computing SDK library */ +#cmakedefine HAVE_INTELPERC + /* Intel Integrated Performance Primitives */ #cmakedefine HAVE_IPP diff --git a/doc/tutorials/introduction/crosscompilation/arm_crosscompile_with_cmake.rst b/doc/tutorials/introduction/crosscompilation/arm_crosscompile_with_cmake.rst index 0b2253ace..87f6d9d4d 100644 --- a/doc/tutorials/introduction/crosscompilation/arm_crosscompile_with_cmake.rst +++ b/doc/tutorials/introduction/crosscompilation/arm_crosscompile_with_cmake.rst @@ -106,8 +106,8 @@ Enable hardware optimizations ----------------------------- Depending on target platform architecture different instruction sets can be used. By default -compiler generates code for armv5l without VFPv3 and NEON extensions. Add ``-DUSE_VFPV3=ON`` -to cmake command line to enable code generation for VFPv3 and ``-DUSE_NEON=ON`` for using +compiler generates code for armv5l without VFPv3 and NEON extensions. Add ``-DENABLE_VFPV3=ON`` +to cmake command line to enable code generation for VFPv3 and ``-DENABLE_NEON=ON`` for using NEON SIMD extensions. TBB is supported on multi core ARM SoCs also. diff --git a/doc/user_guide/ug_intelperc.rst b/doc/user_guide/ug_intelperc.rst new file mode 100644 index 000000000..bae5f7014 --- /dev/null +++ b/doc/user_guide/ug_intelperc.rst @@ -0,0 +1,79 @@ +******* +HighGUI +******* + +.. highlight:: cpp + +Using Creative Senz3D and other Intel Perceptual Computing SDK compatible depth sensors +======================================================================================= + +Depth sensors compatible with Intel Perceptual Computing SDK are supported through ``VideoCapture`` class. Depth map, RGB image and some other formats of output can be retrieved by using familiar interface of ``VideoCapture``. + +In order to use depth sensor with OpenCV you should do the following preliminary steps: + +#. + Install Intel Perceptual Computing SDK (from here http://www.intel.com/software/perceptual). + +#. 
+ Configure OpenCV with Intel Perceptual Computing SDK support by setting ``WITH_INTELPERC`` flag in CMake. If Intel Perceptual Computing SDK is found in install folders OpenCV will be built with Intel Perceptual Computing SDK library (see a status ``INTELPERC`` in CMake log). If CMake process doesn't find Intel Perceptual Computing SDK installation folder automatically, the user should change corresponding CMake variables ``INTELPERC_LIB_DIR`` and ``INTELPERC_INCLUDE_DIR`` to the proper value. + +#. + Build OpenCV. + +VideoCapture can retrieve the following data: + +#. + data given from depth generator: + * ``CV_CAP_INTELPERC_DEPTH_MAP`` - each pixel is a 16-bit integer. The value indicates the distance from an object to the camera's XY plane or the Cartesian depth. (CV_16UC1) + * ``CV_CAP_INTELPERC_UVDEPTH_MAP`` - each pixel contains two 32-bit floating point values in the range of 0-1, representing the mapping of depth coordinates to the color coordinates. (CV_32FC2) + * ``CV_CAP_INTELPERC_IR_MAP`` - each pixel is a 16-bit integer. The value indicates the intensity of the reflected laser beam. (CV_16UC1) +#. + data given from RGB image generator: + * ``CV_CAP_INTELPERC_IMAGE`` - color image. (CV_8UC3) + +In order to get depth map from depth sensor use ``VideoCapture::operator >>``, e. g. :: + + VideoCapture capture( CV_CAP_INTELPERC ); + for(;;) + { + Mat depthMap; + capture >> depthMap; + + if( waitKey( 30 ) >= 0 ) + break; + } + +For getting several data maps use ``VideoCapture::grab`` and ``VideoCapture::retrieve``, e.g. :: + + VideoCapture capture(CV_CAP_INTELPERC); + for(;;) + { + Mat depthMap; + Mat image; + Mat irImage; + + capture.grab(); + + capture.retrieve( depthMap, CV_CAP_INTELPERC_DEPTH_MAP ); + capture.retrieve( image, CV_CAP_INTELPERC_IMAGE ); + capture.retrieve( irImage, CV_CAP_INTELPERC_IR_MAP); + + if( waitKey( 30 ) >= 0 ) + break; + } + +For setting and getting some property of sensor` data generators use ``VideoCapture::set`` and ``VideoCapture::get`` methods respectively, e.g. :: + + VideoCapture capture( CV_CAP_INTELPERC ); + capture.set( CV_CAP_INTELPERC_DEPTH_GENERATOR | CV_CAP_PROP_INTELPERC_PROFILE_IDX, 0 ); + cout << "FPS " << capture.get( CV_CAP_INTELPERC_DEPTH_GENERATOR+CV_CAP_PROP_FPS ) << endl; + +Since two types of sensor's data generators are supported (image generator and depth generator), there are two flags that should be used to set/get property of the needed generator: + +* CV_CAP_INTELPERC_IMAGE_GENERATOR -- a flag for access to the image generator properties. + +* CV_CAP_INTELPERC_DEPTH_GENERATOR -- a flag for access to the depth generator properties. This flag value is assumed by default if neither of the two possible values of the property is set. + +For more information please refer to the example of usage intelperc_capture.cpp_ in ``opencv/samples/cpp`` folder. + +.. 
_intelperc_capture.cpp: https://github.com/Itseez/opencv/tree/master/samples/cpp/intelperc_capture.cpp \ No newline at end of file diff --git a/doc/user_guide/user_guide.rst b/doc/user_guide/user_guide.rst index de9edcb68..76cf756f8 100644 --- a/doc/user_guide/user_guide.rst +++ b/doc/user_guide/user_guide.rst @@ -9,3 +9,4 @@ OpenCV User Guide ug_features2d.rst ug_highgui.rst ug_traincascade.rst + ug_intelperc.rst diff --git a/modules/core/doc/operations_on_arrays.rst b/modules/core/doc/operations_on_arrays.rst index a894d0768..c936457af 100644 --- a/modules/core/doc/operations_on_arrays.rst +++ b/modules/core/doc/operations_on_arrays.rst @@ -903,7 +903,7 @@ So, the function chooses an operation mode depending on the flags and size of th * When ``DFT_COMPLEX_OUTPUT`` is set, the output is a complex matrix of the same size as input. - * When ``DFT_COMPLEX_OUTPUT`` is not set, the output is a real matrix of the same size as input. In case of 2D transform, it uses the packed format as shown above. In case of a single 1D transform, it looks like the first row of the matrix above. In case of multiple 1D transforms (when using the ``DCT_ROWS`` flag), each row of the output matrix looks like the first row of the matrix above. + * When ``DFT_COMPLEX_OUTPUT`` is not set, the output is a real matrix of the same size as input. In case of 2D transform, it uses the packed format as shown above. In case of a single 1D transform, it looks like the first row of the matrix above. In case of multiple 1D transforms (when using the ``DFT_ROWS`` flag), each row of the output matrix looks like the first row of the matrix above. * If the input array is complex and either ``DFT_INVERSE`` or ``DFT_REAL_OUTPUT`` are not set, the output is a complex array of the same size as input. The function performs a forward or inverse 1D or 2D transform of the whole input array or each row of the input array independently, depending on the flags ``DFT_INVERSE`` and ``DFT_ROWS``. 
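Editor's note on the two hunks around this point: the documentation fix above and the ``dxt.cpp`` fix below are two sides of the same flag mix-up — ``cv::dft`` takes ``DFT_ROWS`` while ``cv::dct`` takes ``DCT_ROWS``. The two flags are defined to share the same value, so the old code happened to work; the change simply makes the intent explicit in both the docs and the implementation. A minimal sketch of the intended usage (the matrix sizes here are illustrative assumptions, not taken from the patch)::

    #include <opencv2/core.hpp>

    int main()
    {
        // 4 independent rows of length 8 (cv::dct requires even-sized transforms).
        cv::Mat rows = cv::Mat::eye(4, 8, CV_32F);

        cv::Mat dctOut, dftOut;
        cv::dct(rows, dctOut, cv::DCT_ROWS);                           // 1D DCT of each row
        cv::dft(rows, dftOut, cv::DFT_ROWS | cv::DFT_COMPLEX_OUTPUT);  // 1D DFT of each row
        return 0;
    }
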
diff --git a/modules/core/src/dxt.cpp b/modules/core/src/dxt.cpp index c39f11d4f..acac45c52 100644 --- a/modules/core/src/dxt.cpp +++ b/modules/core/src/dxt.cpp @@ -2577,7 +2577,7 @@ void cv::dct( InputArray _src0, OutputArray _dst, int flags ) DCTFunc dct_func = dct_tbl[(int)inv + (depth == CV_64F)*2]; - if( (flags & DFT_ROWS) || src.rows == 1 || + if( (flags & DCT_ROWS) || src.rows == 1 || (src.cols == 1 && (src.isContinuous() && dst.isContinuous()))) { stage = end_stage = 0; @@ -2597,7 +2597,7 @@ void cv::dct( InputArray _src0, OutputArray _dst, int flags ) { len = src.cols; count = src.rows; - if( len == 1 && !(flags & DFT_ROWS) ) + if( len == 1 && !(flags & DCT_ROWS) ) { len = src.rows; count = 1; diff --git a/modules/core/src/matrix.cpp b/modules/core/src/matrix.cpp index 6f2580498..d4db98119 100644 --- a/modules/core/src/matrix.cpp +++ b/modules/core/src/matrix.cpp @@ -2760,39 +2760,24 @@ void cv::transpose( InputArray _src, OutputArray _dst ) } +////////////////////////////////////// completeSymm ///////////////////////////////////////// + void cv::completeSymm( InputOutputArray _m, bool LtoR ) { Mat m = _m.getMat(); - CV_Assert( m.dims <= 2 ); + size_t step = m.step, esz = m.elemSize(); + CV_Assert( m.dims <= 2 && m.rows == m.cols ); - int i, j, nrows = m.rows, type = m.type(); - int j0 = 0, j1 = nrows; - CV_Assert( m.rows == m.cols ); + int rows = m.rows; + int j0 = 0, j1 = rows; - if( type == CV_32FC1 || type == CV_32SC1 ) + uchar* data = m.data; + for( int i = 0; i < rows; i++ ) { - int* data = (int*)m.data; - size_t step = m.step/sizeof(data[0]); - for( i = 0; i < nrows; i++ ) - { - if( !LtoR ) j1 = i; else j0 = i+1; - for( j = j0; j < j1; j++ ) - data[i*step + j] = data[j*step + i]; - } + if( !LtoR ) j1 = i; else j0 = i+1; + for( int j = j0; j < j1; j++ ) + memcpy(data + (i*step + j*esz), data + (j*step + i*esz), esz); } - else if( type == CV_64FC1 ) - { - double* data = (double*)m.data; - size_t step = m.step/sizeof(data[0]); - for( i = 0; i < nrows; i++ ) - { - if( !LtoR ) j1 = i; else j0 = i+1; - for( j = j0; j < j1; j++ ) - data[i*step + j] = data[j*step + i]; - } - } - else - CV_Error( CV_StsUnsupportedFormat, "" ); } diff --git a/modules/highgui/CMakeLists.txt b/modules/highgui/CMakeLists.txt index 51ab0c3ef..ea8aefde5 100644 --- a/modules/highgui/CMakeLists.txt +++ b/modules/highgui/CMakeLists.txt @@ -222,6 +222,12 @@ elseif(HAVE_QTKIT) list(APPEND HIGHGUI_LIBRARIES "-framework QTKit" "-framework QuartzCore" "-framework AppKit") endif() +if(HAVE_INTELPERC) + list(APPEND highgui_srcs src/cap_intelperc.cpp) + ocv_include_directories(${INTELPERC_INCLUDE_DIR}) + list(APPEND HIGHGUI_LIBRARIES ${INTELPERC_LIBRARIES}) +endif(HAVE_INTELPERC) + if(IOS) add_definitions(-DHAVE_IOS=1) list(APPEND highgui_srcs src/ios_conversions.mm src/cap_ios_abstract_camera.mm src/cap_ios_photo_camera.mm src/cap_ios_video_camera.mm) diff --git a/modules/highgui/include/opencv2/highgui.hpp b/modules/highgui/include/opencv2/highgui.hpp index cebf8fe22..eb4ee8c03 100644 --- a/modules/highgui/include/opencv2/highgui.hpp +++ b/modules/highgui/include/opencv2/highgui.hpp @@ -271,7 +271,8 @@ enum { CAP_ANY = 0, // autodetect CAP_XIAPI = 1100, // XIMEA Camera API CAP_AVFOUNDATION = 1200, // AVFoundation framework for iOS (OS X Lion will have the same API) CAP_GIGANETIX = 1300, // Smartek Giganetix GigEVisionSDK - CAP_MSMF = 1400 // Microsoft Media Foundation (via videoInput) + CAP_MSMF = 1400, // Microsoft Media Foundation (via videoInput) + CAP_INTELPERC = 1500 // Intel Perceptual Computing SDK }; // 
generic properties (based on DC1394 properties) @@ -496,6 +497,26 @@ enum { CAP_PROP_GIGA_FRAME_OFFSET_X = 10001, CAP_PROP_GIGA_FRAME_SENS_HEIGH = 10006 }; +enum { CAP_PROP_INTELPERC_PROFILE_COUNT = 11001, + CAP_PROP_INTELPERC_PROFILE_IDX = 11002, + CAP_PROP_INTELPERC_DEPTH_LOW_CONFIDENCE_VALUE = 11003, + CAP_PROP_INTELPERC_DEPTH_SATURATION_VALUE = 11004, + CAP_PROP_INTELPERC_DEPTH_CONFIDENCE_THRESHOLD = 11005, + CAP_PROP_INTELPERC_DEPTH_FOCAL_LENGTH_HORZ = 11006, + CAP_PROP_INTELPERC_DEPTH_FOCAL_LENGTH_VERT = 11007 + }; + +// Intel PerC streams +enum { CAP_INTELPERC_DEPTH_GENERATOR = 1 << 29, + CAP_INTELPERC_IMAGE_GENERATOR = 1 << 28, + CAP_INTELPERC_GENERATORS_MASK = CAP_INTELPERC_DEPTH_GENERATOR + CAP_INTELPERC_IMAGE_GENERATOR + }; + +enum { CAP_INTELPERC_DEPTH_MAP = 0, // Each pixel is a 16-bit integer. The value indicates the distance from an object to the camera's XY plane or the Cartesian depth. + CAP_INTELPERC_UVDEPTH_MAP = 1, // Each pixel contains two 32-bit floating point values in the range of 0-1, representing the mapping of depth coordinates to the color coordinates. + CAP_INTELPERC_IR_MAP = 2, // Each pixel is a 16-bit integer. The value indicates the intensity of the reflected laser beam. + CAP_INTELPERC_IMAGE = 3 + }; class CV_EXPORTS_W VideoCapture { diff --git a/modules/highgui/include/opencv2/highgui/highgui_c.h b/modules/highgui/include/opencv2/highgui/highgui_c.h index 2ebea0d30..1a42e5804 100644 --- a/modules/highgui/include/opencv2/highgui/highgui_c.h +++ b/modules/highgui/include/opencv2/highgui/highgui_c.h @@ -313,7 +313,9 @@ enum CV_CAP_AVFOUNDATION = 1200, // AVFoundation framework for iOS (OS X Lion will have the same API) - CV_CAP_GIGANETIX = 1300 // Smartek Giganetix GigEVisionSDK + CV_CAP_GIGANETIX = 1300, // Smartek Giganetix GigEVisionSDK + + CV_CAP_INTELPERC = 1500 // Intel Perceptual Computing SDK }; /* start capturing frames from camera: index = camera_index + domain_offset (CV_CAP_*) */ @@ -459,16 +461,29 @@ enum CV_CAP_PROP_IOS_DEVICE_EXPOSURE = 9002, CV_CAP_PROP_IOS_DEVICE_FLASH = 9003, CV_CAP_PROP_IOS_DEVICE_WHITEBALANCE = 9004, - CV_CAP_PROP_IOS_DEVICE_TORCH = 9005 + CV_CAP_PROP_IOS_DEVICE_TORCH = 9005, // Properties of cameras available through Smartek Giganetix Ethernet Vision interface /* --- Vladimir Litvinenko (litvinenko.vladimir@gmail.com) --- */ - ,CV_CAP_PROP_GIGA_FRAME_OFFSET_X = 10001, + CV_CAP_PROP_GIGA_FRAME_OFFSET_X = 10001, CV_CAP_PROP_GIGA_FRAME_OFFSET_Y = 10002, CV_CAP_PROP_GIGA_FRAME_WIDTH_MAX = 10003, CV_CAP_PROP_GIGA_FRAME_HEIGH_MAX = 10004, CV_CAP_PROP_GIGA_FRAME_SENS_WIDTH = 10005, - CV_CAP_PROP_GIGA_FRAME_SENS_HEIGH = 10006 + CV_CAP_PROP_GIGA_FRAME_SENS_HEIGH = 10006, + + CV_CAP_PROP_INTELPERC_PROFILE_COUNT = 11001, + CV_CAP_PROP_INTELPERC_PROFILE_IDX = 11002, + CV_CAP_PROP_INTELPERC_DEPTH_LOW_CONFIDENCE_VALUE = 11003, + CV_CAP_PROP_INTELPERC_DEPTH_SATURATION_VALUE = 11004, + CV_CAP_PROP_INTELPERC_DEPTH_CONFIDENCE_THRESHOLD = 11005, + CV_CAP_PROP_INTELPERC_DEPTH_FOCAL_LENGTH_HORZ = 11006, + CV_CAP_PROP_INTELPERC_DEPTH_FOCAL_LENGTH_VERT = 11007, + + // Intel PerC streams + CV_CAP_INTELPERC_DEPTH_GENERATOR = 1 << 29, + CV_CAP_INTELPERC_IMAGE_GENERATOR = 1 << 28, + CV_CAP_INTELPERC_GENERATORS_MASK = CV_CAP_INTELPERC_DEPTH_GENERATOR + CV_CAP_INTELPERC_IMAGE_GENERATOR }; enum @@ -549,6 +564,14 @@ enum CV_CAP_ANDROID_ANTIBANDING_OFF }; +enum +{ + CV_CAP_INTELPERC_DEPTH_MAP = 0, // Each pixel is a 16-bit integer. The value indicates the distance from an object to the camera's XY plane or the Cartesian depth. 
+ CV_CAP_INTELPERC_UVDEPTH_MAP = 1, // Each pixel contains two 32-bit floating point values in the range of 0-1, representing the mapping of depth coordinates to the color coordinates. + CV_CAP_INTELPERC_IR_MAP = 2, // Each pixel is a 16-bit integer. The value indicates the intensity of the reflected laser beam. + CV_CAP_INTELPERC_IMAGE = 3 +}; + /* retrieve or set capture properties */ CVAPI(double) cvGetCaptureProperty( CvCapture* capture, int property_id ); CVAPI(int) cvSetCaptureProperty( CvCapture* capture, int property_id, double value ); diff --git a/modules/highgui/src/cap.cpp b/modules/highgui/src/cap.cpp index 0f4e6afb8..105f92e8c 100644 --- a/modules/highgui/src/cap.cpp +++ b/modules/highgui/src/cap.cpp @@ -155,6 +155,9 @@ CV_IMPL CvCapture * cvCreateCameraCapture (int index) #endif #ifdef HAVE_GIGE_API CV_CAP_GIGANETIX, +#endif +#ifdef HAVE_INTELPERC + CV_CAP_INTELPERC, #endif -1 }; @@ -193,6 +196,7 @@ CV_IMPL CvCapture * cvCreateCameraCapture (int index) defined(HAVE_AVFOUNDATION) || \ defined(HAVE_ANDROID_NATIVE_CAMERA) || \ defined(HAVE_GIGE_API) || \ + defined(HAVE_INTELPERC) || \ (0) // local variable to memorize the captured device CvCapture *capture; @@ -342,6 +346,13 @@ CV_IMPL CvCapture * cvCreateCameraCapture (int index) break; // CV_CAP_GIGANETIX #endif +#ifdef HAVE_INTELPERC + case CV_CAP_INTELPERC: + capture = cvCreateCameraCapture_IntelPerC(index); + if (capture) + return capture; + break; // CV_CAP_INTEL_PERC +#endif } } diff --git a/modules/highgui/src/cap_intelperc.cpp b/modules/highgui/src/cap_intelperc.cpp new file mode 100644 index 000000000..368f4fd2c --- /dev/null +++ b/modules/highgui/src/cap_intelperc.cpp @@ -0,0 +1,714 @@ +#include "precomp.hpp" + +#ifdef HAVE_INTELPERC + +#include "pxcsession.h" +#include "pxcsmartptr.h" +#include "pxccapture.h" + +class CvIntelPerCStreamBase +{ +protected: + struct FrameInternal + { + IplImage* retrieveFrame() + { + if (m_mat.empty()) + return NULL; + m_iplHeader = IplImage(m_mat); + return &m_iplHeader; + } + cv::Mat m_mat; + private: + IplImage m_iplHeader; + }; +public: + CvIntelPerCStreamBase() + : m_profileIdx(-1) + , m_frameIdx(0) + , m_timeStampStartNS(0) + { + } + virtual ~CvIntelPerCStreamBase() + { + } + + bool isValid() + { + return (m_device.IsValid() && m_stream.IsValid()); + } + bool grabFrame() + { + if (!m_stream.IsValid()) + return false; + if (-1 == m_profileIdx) + { + if (!setProperty(CV_CAP_PROP_INTELPERC_PROFILE_IDX, 0)) + return false; + } + PXCSmartPtr pxcImage; PXCSmartSP sp; + if (PXC_STATUS_NO_ERROR > m_stream->ReadStreamAsync(&pxcImage, &sp)) + return false; + if (PXC_STATUS_NO_ERROR > sp->Synchronize()) + return false; + if (0 == m_timeStampStartNS) + m_timeStampStartNS = pxcImage->QueryTimeStamp(); + m_timeStamp = (double)((pxcImage->QueryTimeStamp() - m_timeStampStartNS) / 10000); + m_frameIdx++; + return prepareIplImage(pxcImage); + } + int getProfileIDX() const + { + return m_profileIdx; + } +public: + virtual bool initStream(PXCSession *session) = 0; + virtual double getProperty(int propIdx) + { + double ret = 0.0; + switch (propIdx) + { + case CV_CAP_PROP_INTELPERC_PROFILE_COUNT: + ret = (double)m_profiles.size(); + break; + case CV_CAP_PROP_FRAME_WIDTH : + if ((0 <= m_profileIdx) && (m_profileIdx < m_profiles.size())) + ret = (double)m_profiles[m_profileIdx].imageInfo.width; + break; + case CV_CAP_PROP_FRAME_HEIGHT : + if ((0 <= m_profileIdx) && (m_profileIdx < m_profiles.size())) + ret = (double)m_profiles[m_profileIdx].imageInfo.height; + break; + case CV_CAP_PROP_FPS : + if ((0 <= 
m_profileIdx) && (m_profileIdx < m_profiles.size())) + { + ret = ((double)m_profiles[m_profileIdx].frameRateMin.numerator / (double)m_profiles[m_profileIdx].frameRateMin.denominator + + (double)m_profiles[m_profileIdx].frameRateMax.numerator / (double)m_profiles[m_profileIdx].frameRateMax.denominator) / 2.0; + } + break; + case CV_CAP_PROP_POS_FRAMES: + ret = (double)m_frameIdx; + break; + case CV_CAP_PROP_POS_MSEC: + ret = m_timeStamp; + break; + }; + return ret; + } + virtual bool setProperty(int propIdx, double propVal) + { + bool isSet = false; + switch (propIdx) + { + case CV_CAP_PROP_INTELPERC_PROFILE_IDX: + { + int propValInt = (int)propVal; + if ((0 <= propValInt) && (propValInt < m_profiles.size())) + { + if (m_profileIdx != propValInt) + { + m_profileIdx = propValInt; + if (m_stream.IsValid()) + m_stream->SetProfile(&m_profiles[m_profileIdx]); + m_frameIdx = 0; + m_timeStampStartNS = 0; + } + isSet = true; + } + } + break; + }; + return isSet; + } +protected: + PXCSmartPtr m_device; + bool initDevice(PXCSession *session) + { + if (NULL == session) + return false; + + pxcStatus sts = PXC_STATUS_NO_ERROR; + PXCSession::ImplDesc templat; + memset(&templat,0,sizeof(templat)); + templat.group = PXCSession::IMPL_GROUP_SENSOR; + templat.subgroup= PXCSession::IMPL_SUBGROUP_VIDEO_CAPTURE; + + for (int modidx = 0; PXC_STATUS_NO_ERROR <= sts; modidx++) + { + PXCSession::ImplDesc desc; + sts = session->QueryImpl(&templat, modidx, &desc); + if (PXC_STATUS_NO_ERROR > sts) + break; + + PXCSmartPtr capture; + sts = session->CreateImpl(&desc, &capture); + if (!capture.IsValid()) + continue; + + /* enumerate devices */ + for (int devidx = 0; PXC_STATUS_NO_ERROR <= sts; devidx++) + { + PXCSmartPtr device; + sts = capture->CreateDevice(devidx, &device); + if (PXC_STATUS_NO_ERROR <= sts) + { + m_device = device.ReleasePtr(); + return true; + } + } + } + return false; + } + + PXCSmartPtr m_stream; + void initStreamImpl(PXCImage::ImageType type) + { + if (!m_device.IsValid()) + return; + + pxcStatus sts = PXC_STATUS_NO_ERROR; + /* enumerate streams */ + for (int streamidx = 0; PXC_STATUS_NO_ERROR <= sts; streamidx++) + { + PXCCapture::Device::StreamInfo sinfo; + sts = m_device->QueryStream(streamidx, &sinfo); + if (PXC_STATUS_NO_ERROR > sts) + break; + if (PXCCapture::VideoStream::CUID != sinfo.cuid) + continue; + if (type != sinfo.imageType) + continue; + + sts = m_device->CreateStream(streamidx, &m_stream); + if (PXC_STATUS_NO_ERROR == sts) + break; + m_stream.ReleaseRef(); + } + } +protected: + std::vector m_profiles; + int m_profileIdx; + int m_frameIdx; + pxcU64 m_timeStampStartNS; + double m_timeStamp; + + virtual bool validProfile(const PXCCapture::VideoStream::ProfileInfo& /*pinfo*/) + { + return true; + } + void enumProfiles() + { + m_profiles.clear(); + if (!m_stream.IsValid()) + return; + pxcStatus sts = PXC_STATUS_NO_ERROR; + for (int profidx = 0; PXC_STATUS_NO_ERROR <= sts; profidx++) + { + PXCCapture::VideoStream::ProfileInfo pinfo; + sts = m_stream->QueryProfile(profidx, &pinfo); + if (PXC_STATUS_NO_ERROR > sts) + break; + if (validProfile(pinfo)) + m_profiles.push_back(pinfo); + } + } + virtual bool prepareIplImage(PXCImage *pxcImage) = 0; +}; + +class CvIntelPerCStreamImage + : public CvIntelPerCStreamBase +{ +public: + CvIntelPerCStreamImage() + { + } + virtual ~CvIntelPerCStreamImage() + { + } + + virtual bool initStream(PXCSession *session) + { + if (!initDevice(session)) + return false; + initStreamImpl(PXCImage::IMAGE_TYPE_COLOR); + if (!m_stream.IsValid()) + return false; + 
enumProfiles(); + return true; + } + virtual double getProperty(int propIdx) + { + switch (propIdx) + { + case CV_CAP_PROP_BRIGHTNESS: + { + if (!m_device.IsValid()) + return 0.0; + float fret = 0.0f; + if (PXC_STATUS_NO_ERROR == m_device->QueryProperty(PXCCapture::Device::PROPERTY_COLOR_BRIGHTNESS, &fret)) + return (double)fret; + return 0.0; + } + break; + case CV_CAP_PROP_CONTRAST: + { + if (!m_device.IsValid()) + return 0.0; + float fret = 0.0f; + if (PXC_STATUS_NO_ERROR == m_device->QueryProperty(PXCCapture::Device::PROPERTY_COLOR_CONTRAST, &fret)) + return (double)fret; + return 0.0; + } + break; + case CV_CAP_PROP_SATURATION: + { + if (!m_device.IsValid()) + return 0.0; + float fret = 0.0f; + if (PXC_STATUS_NO_ERROR == m_device->QueryProperty(PXCCapture::Device::PROPERTY_COLOR_SATURATION, &fret)) + return (double)fret; + return 0.0; + } + break; + case CV_CAP_PROP_HUE: + { + if (!m_device.IsValid()) + return 0.0; + float fret = 0.0f; + if (PXC_STATUS_NO_ERROR == m_device->QueryProperty(PXCCapture::Device::PROPERTY_COLOR_HUE, &fret)) + return (double)fret; + return 0.0; + } + break; + case CV_CAP_PROP_GAMMA: + { + if (!m_device.IsValid()) + return 0.0; + float fret = 0.0f; + if (PXC_STATUS_NO_ERROR == m_device->QueryProperty(PXCCapture::Device::PROPERTY_COLOR_GAMMA, &fret)) + return (double)fret; + return 0.0; + } + break; + case CV_CAP_PROP_SHARPNESS: + { + if (!m_device.IsValid()) + return 0.0; + float fret = 0.0f; + if (PXC_STATUS_NO_ERROR == m_device->QueryProperty(PXCCapture::Device::PROPERTY_COLOR_SHARPNESS, &fret)) + return (double)fret; + return 0.0; + } + break; + case CV_CAP_PROP_GAIN: + { + if (!m_device.IsValid()) + return 0.0; + float fret = 0.0f; + if (PXC_STATUS_NO_ERROR == m_device->QueryProperty(PXCCapture::Device::PROPERTY_COLOR_GAIN, &fret)) + return (double)fret; + return 0.0; + } + break; + case CV_CAP_PROP_BACKLIGHT: + { + if (!m_device.IsValid()) + return 0.0; + float fret = 0.0f; + if (PXC_STATUS_NO_ERROR == m_device->QueryProperty(PXCCapture::Device::PROPERTY_COLOR_BACK_LIGHT_COMPENSATION, &fret)) + return (double)fret; + return 0.0; + } + break; + case CV_CAP_PROP_EXPOSURE: + { + if (!m_device.IsValid()) + return 0.0; + float fret = 0.0f; + if (PXC_STATUS_NO_ERROR == m_device->QueryProperty(PXCCapture::Device::PROPERTY_COLOR_EXPOSURE, &fret)) + return (double)fret; + return 0.0; + } + break; + //Add image stream specific properties + } + return CvIntelPerCStreamBase::getProperty(propIdx); + } + virtual bool setProperty(int propIdx, double propVal) + { + switch (propIdx) + { + case CV_CAP_PROP_BRIGHTNESS: + { + if (!m_device.IsValid()) + return false; + return (PXC_STATUS_NO_ERROR == m_device->SetProperty(PXCCapture::Device::PROPERTY_COLOR_BRIGHTNESS, (float)propVal)); + } + break; + case CV_CAP_PROP_CONTRAST: + { + if (!m_device.IsValid()) + return false; + return (PXC_STATUS_NO_ERROR == m_device->SetProperty(PXCCapture::Device::PROPERTY_COLOR_CONTRAST, (float)propVal)); + } + break; + case CV_CAP_PROP_SATURATION: + { + if (!m_device.IsValid()) + return false; + return (PXC_STATUS_NO_ERROR == m_device->SetProperty(PXCCapture::Device::PROPERTY_COLOR_SATURATION, (float)propVal)); + } + break; + case CV_CAP_PROP_HUE: + { + if (!m_device.IsValid()) + return false; + return (PXC_STATUS_NO_ERROR == m_device->SetProperty(PXCCapture::Device::PROPERTY_COLOR_HUE, (float)propVal)); + } + break; + case CV_CAP_PROP_GAMMA: + { + if (!m_device.IsValid()) + return false; + return (PXC_STATUS_NO_ERROR == m_device->SetProperty(PXCCapture::Device::PROPERTY_COLOR_GAMMA, 
(float)propVal)); + } + break; + case CV_CAP_PROP_SHARPNESS: + { + if (!m_device.IsValid()) + return false; + return (PXC_STATUS_NO_ERROR == m_device->SetProperty(PXCCapture::Device::PROPERTY_COLOR_SHARPNESS, (float)propVal)); + } + break; + case CV_CAP_PROP_GAIN: + { + if (!m_device.IsValid()) + return false; + return (PXC_STATUS_NO_ERROR == m_device->SetProperty(PXCCapture::Device::PROPERTY_COLOR_GAIN, (float)propVal)); + } + break; + case CV_CAP_PROP_BACKLIGHT: + { + if (!m_device.IsValid()) + return false; + return (PXC_STATUS_NO_ERROR == m_device->SetProperty(PXCCapture::Device::PROPERTY_COLOR_BACK_LIGHT_COMPENSATION, (float)propVal)); + } + break; + case CV_CAP_PROP_EXPOSURE: + { + if (!m_device.IsValid()) + return false; + return (PXC_STATUS_NO_ERROR == m_device->SetProperty(PXCCapture::Device::PROPERTY_COLOR_EXPOSURE, (float)propVal)); + } + break; + //Add image stream specific properties + } + return CvIntelPerCStreamBase::setProperty(propIdx, propVal); + } +public: + IplImage* retrieveFrame() + { + return m_frame.retrieveFrame(); + } +protected: + FrameInternal m_frame; + bool prepareIplImage(PXCImage *pxcImage) + { + if (NULL == pxcImage) + return false; + PXCImage::ImageInfo info; + pxcImage->QueryInfo(&info); + + PXCImage::ImageData data; + pxcImage->AcquireAccess(PXCImage::ACCESS_READ, PXCImage::COLOR_FORMAT_RGB24, &data); + + if (PXCImage::SURFACE_TYPE_SYSTEM_MEMORY != data.type) + return false; + + cv::Mat temp(info.height, info.width, CV_8UC3, data.planes[0], data.pitches[0]); + temp.copyTo(m_frame.m_mat); + + pxcImage->ReleaseAccess(&data); + return true; + } +}; + +class CvIntelPerCStreamDepth + : public CvIntelPerCStreamBase +{ +public: + CvIntelPerCStreamDepth() + { + } + virtual ~CvIntelPerCStreamDepth() + { + } + + virtual bool initStream(PXCSession *session) + { + if (!initDevice(session)) + return false; + initStreamImpl(PXCImage::IMAGE_TYPE_DEPTH); + if (!m_stream.IsValid()) + return false; + enumProfiles(); + return true; + } + virtual double getProperty(int propIdx) + { + switch (propIdx) + { + case CV_CAP_PROP_INTELPERC_DEPTH_LOW_CONFIDENCE_VALUE: + { + if (!m_device.IsValid()) + return 0.0; + float fret = 0.0f; + if (PXC_STATUS_NO_ERROR == m_device->QueryProperty(PXCCapture::Device::PROPERTY_DEPTH_LOW_CONFIDENCE_VALUE, &fret)) + return (double)fret; + return 0.0; + } + break; + case CV_CAP_PROP_INTELPERC_DEPTH_SATURATION_VALUE: + { + if (!m_device.IsValid()) + return 0.0; + float fret = 0.0f; + if (PXC_STATUS_NO_ERROR == m_device->QueryProperty(PXCCapture::Device::PROPERTY_DEPTH_SATURATION_VALUE, &fret)) + return (double)fret; + return 0.0; + } + break; + case CV_CAP_PROP_INTELPERC_DEPTH_CONFIDENCE_THRESHOLD: + { + if (!m_device.IsValid()) + return 0.0; + float fret = 0.0f; + if (PXC_STATUS_NO_ERROR == m_device->QueryProperty(PXCCapture::Device::PROPERTY_DEPTH_CONFIDENCE_THRESHOLD, &fret)) + return (double)fret; + return 0.0; + } + break; + case CV_CAP_PROP_INTELPERC_DEPTH_FOCAL_LENGTH_HORZ: + { + if (!m_device.IsValid()) + return 0.0f; + PXCPointF32 ptf; + if (PXC_STATUS_NO_ERROR == m_device->QueryPropertyAsPoint(PXCCapture::Device::PROPERTY_DEPTH_FOCAL_LENGTH, &ptf)) + return (double)ptf.x; + return 0.0; + } + break; + case CV_CAP_PROP_INTELPERC_DEPTH_FOCAL_LENGTH_VERT: + { + if (!m_device.IsValid()) + return 0.0f; + PXCPointF32 ptf; + if (PXC_STATUS_NO_ERROR == m_device->QueryPropertyAsPoint(PXCCapture::Device::PROPERTY_DEPTH_FOCAL_LENGTH, &ptf)) + return (double)ptf.y; + return 0.0; + } + break; + //Add depth stream sepcific properties + } + return 
CvIntelPerCStreamBase::getProperty(propIdx); + } + virtual bool setProperty(int propIdx, double propVal) + { + switch (propIdx) + { + case CV_CAP_PROP_INTELPERC_DEPTH_LOW_CONFIDENCE_VALUE: + { + if (!m_device.IsValid()) + return false; + return (PXC_STATUS_NO_ERROR == m_device->SetProperty(PXCCapture::Device::PROPERTY_DEPTH_LOW_CONFIDENCE_VALUE, (float)propVal)); + } + break; + case CV_CAP_PROP_INTELPERC_DEPTH_SATURATION_VALUE: + { + if (!m_device.IsValid()) + return false; + return (PXC_STATUS_NO_ERROR == m_device->SetProperty(PXCCapture::Device::PROPERTY_DEPTH_SATURATION_VALUE, (float)propVal)); + } + break; + case CV_CAP_PROP_INTELPERC_DEPTH_CONFIDENCE_THRESHOLD: + { + if (!m_device.IsValid()) + return false; + return (PXC_STATUS_NO_ERROR == m_device->SetProperty(PXCCapture::Device::PROPERTY_DEPTH_CONFIDENCE_THRESHOLD, (float)propVal)); + } + break; + //Add depth stream sepcific properties + } + return CvIntelPerCStreamBase::setProperty(propIdx, propVal); + } +public: + IplImage* retrieveDepthFrame() + { + return m_frameDepth.retrieveFrame(); + } + IplImage* retrieveIRFrame() + { + return m_frameIR.retrieveFrame(); + } + IplImage* retrieveUVFrame() + { + return m_frameUV.retrieveFrame(); + } +protected: + virtual bool validProfile(const PXCCapture::VideoStream::ProfileInfo& pinfo) + { + return (PXCImage::COLOR_FORMAT_DEPTH == pinfo.imageInfo.format); + } +protected: + FrameInternal m_frameDepth; + FrameInternal m_frameIR; + FrameInternal m_frameUV; + + bool prepareIplImage(PXCImage *pxcImage) + { + if (NULL == pxcImage) + return false; + PXCImage::ImageInfo info; + pxcImage->QueryInfo(&info); + + PXCImage::ImageData data; + pxcImage->AcquireAccess(PXCImage::ACCESS_READ, &data); + + if (PXCImage::SURFACE_TYPE_SYSTEM_MEMORY != data.type) + return false; + + if (PXCImage::COLOR_FORMAT_DEPTH != data.format) + return false; + + { + cv::Mat temp(info.height, info.width, CV_16SC1, data.planes[0], data.pitches[0]); + temp.copyTo(m_frameDepth.m_mat); + } + { + cv::Mat temp(info.height, info.width, CV_16SC1, data.planes[1], data.pitches[1]); + temp.copyTo(m_frameIR.m_mat); + } + { + cv::Mat temp(info.height, info.width, CV_32FC2, data.planes[2], data.pitches[2]); + temp.copyTo(m_frameUV.m_mat); + } + + pxcImage->ReleaseAccess(&data); + return true; + } +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +class CvCapture_IntelPerC : public CvCapture +{ +public: + CvCapture_IntelPerC(int /*index*/) + : m_contextOpened(false) + { + pxcStatus sts = PXCSession_Create(&m_session); + if (PXC_STATUS_NO_ERROR > sts) + return; + m_contextOpened = m_imageStream.initStream(m_session); + m_contextOpened &= m_depthStream.initStream(m_session); + } + virtual ~CvCapture_IntelPerC(){} + + virtual double getProperty(int propIdx) + { + double propValue = 0; + int purePropIdx = propIdx & ~CV_CAP_INTELPERC_GENERATORS_MASK; + if (CV_CAP_INTELPERC_IMAGE_GENERATOR == (propIdx & CV_CAP_INTELPERC_GENERATORS_MASK)) + { + propValue = m_imageStream.getProperty(purePropIdx); + } + else if (CV_CAP_INTELPERC_DEPTH_GENERATOR == (propIdx & CV_CAP_INTELPERC_GENERATORS_MASK)) + { + propValue = m_depthStream.getProperty(purePropIdx); + } + else + { + propValue = m_depthStream.getProperty(purePropIdx); + } + return propValue; + } + virtual bool setProperty(int propIdx, double propVal) + { + bool isSet = false; + int purePropIdx = propIdx & ~CV_CAP_INTELPERC_GENERATORS_MASK; + if (CV_CAP_INTELPERC_IMAGE_GENERATOR == (propIdx & 
CV_CAP_INTELPERC_GENERATORS_MASK)) + { + isSet = m_imageStream.setProperty(purePropIdx, propVal); + } + else if (CV_CAP_INTELPERC_DEPTH_GENERATOR == (propIdx & CV_CAP_INTELPERC_GENERATORS_MASK)) + { + isSet = m_depthStream.setProperty(purePropIdx, propVal); + } + else + { + isSet = m_depthStream.setProperty(purePropIdx, propVal); + } + return isSet; + } + + bool grabFrame() + { + if (!isOpened()) + return false; + + bool isGrabbed = false; + if (m_depthStream.isValid()) + isGrabbed = m_depthStream.grabFrame(); + if ((m_imageStream.isValid()) && (-1 != m_imageStream.getProfileIDX())) + isGrabbed &= m_imageStream.grabFrame(); + + return isGrabbed; + } + + virtual IplImage* retrieveFrame(int outputType) + { + IplImage* image = 0; + switch (outputType) + { + case CV_CAP_INTELPERC_DEPTH_MAP: + image = m_depthStream.retrieveDepthFrame(); + break; + case CV_CAP_INTELPERC_UVDEPTH_MAP: + image = m_depthStream.retrieveUVFrame(); + break; + case CV_CAP_INTELPERC_IR_MAP: + image = m_depthStream.retrieveIRFrame(); + break; + case CV_CAP_INTELPERC_IMAGE: + image = m_imageStream.retrieveFrame(); + break; + } + CV_Assert(NULL != image); + return image; + } + + bool isOpened() const + { + return m_contextOpened; + } +protected: + bool m_contextOpened; + + PXCSmartPtr m_session; + CvIntelPerCStreamImage m_imageStream; + CvIntelPerCStreamDepth m_depthStream; +}; + + +CvCapture* cvCreateCameraCapture_IntelPerC(int index) +{ + CvCapture_IntelPerC* capture = new CvCapture_IntelPerC(index); + + if( capture->isOpened() ) + return capture; + + delete capture; + return 0; +} + + +#endif //HAVE_INTELPERC diff --git a/modules/highgui/src/precomp.hpp b/modules/highgui/src/precomp.hpp index d225cb314..925cfdf40 100644 --- a/modules/highgui/src/precomp.hpp +++ b/modules/highgui/src/precomp.hpp @@ -128,6 +128,7 @@ CvCapture* cvCreateFileCapture_OpenNI( const char* filename ); CvCapture* cvCreateCameraCapture_Android( int index ); CvCapture* cvCreateCameraCapture_XIMEA( int index ); CvCapture* cvCreateCameraCapture_AVFoundation(int index); +CvCapture* cvCreateCameraCapture_IntelPerC(int index); CVAPI(int) cvHaveImageReader(const char* filename); diff --git a/modules/highgui/test/test_precomp.hpp b/modules/highgui/test/test_precomp.hpp index 8468e4618..826d16574 100644 --- a/modules/highgui/test/test_precomp.hpp +++ b/modules/highgui/test/test_precomp.hpp @@ -35,6 +35,7 @@ defined(HAVE_XIMEA) || \ defined(HAVE_AVFOUNDATION) || \ defined(HAVE_GIGE_API) || \ + defined(HAVE_INTELPERC) || \ (0) //defined(HAVE_ANDROID_NATIVE_CAMERA) || - enable after #1193 # define BUILD_WITH_CAMERA_SUPPORT 1 diff --git a/modules/imgproc/src/imgwarp.cpp b/modules/imgproc/src/imgwarp.cpp index 875813068..30c42c032 100644 --- a/modules/imgproc/src/imgwarp.cpp +++ b/modules/imgproc/src/imgwarp.cpp @@ -3299,7 +3299,10 @@ public: if( m1->type() == CV_16SC2 && (m2->type() == CV_16UC1 || m2->type() == CV_16SC1) ) { bufxy = (*m1)(Rect(x, y, bcols, brows)); - bufa = (*m2)(Rect(x, y, bcols, brows)); + + const ushort* sA = (const ushort*)(m2->data + m2->step*(y+y1)) + x; + for( x1 = 0; x1 < bcols; x1++ ) + A[x1] = (ushort)(sA[x1] & (INTER_TAB_SIZE2-1)); } else if( planar_input ) { @@ -3680,7 +3683,7 @@ void cv::convertMaps( InputArray _map1, InputArray _map2, { for( x = 0; x < size.width; x++ ) { - int fxy = src2 ? src2[x] : 0; + int fxy = src2 ? 
src2[x] & (INTER_TAB_SIZE2-1) : 0; dst1f[x] = src1[x*2] + (fxy & (INTER_TAB_SIZE-1))*scale; dst2f[x] = src1[x*2+1] + (fxy >> INTER_BITS)*scale; } @@ -3689,7 +3692,7 @@ void cv::convertMaps( InputArray _map1, InputArray _map2, { for( x = 0; x < size.width; x++ ) { - int fxy = src2 ? src2[x] : 0; + int fxy = src2 ? src2[x] & (INTER_TAB_SIZE2-1): 0; dst1f[x*2] = src1[x*2] + (fxy & (INTER_TAB_SIZE-1))*scale; dst1f[x*2+1] = src1[x*2+1] + (fxy >> INTER_BITS)*scale; } diff --git a/modules/java/generator/gen_java.py b/modules/java/generator/gen_java.py index c41e6336c..23ed3a9a1 100755 --- a/modules/java/generator/gen_java.py +++ b/modules/java/generator/gen_java.py @@ -18,6 +18,8 @@ class_ignore_list = ( const_ignore_list = ( "CV_CAP_OPENNI", "CV_CAP_PROP_OPENNI_", + "CV_CAP_INTELPERC", + "CV_CAP_PROP_INTELPERC_" "WINDOW_AUTOSIZE", "CV_WND_PROP_", "CV_WINDOW_", diff --git a/modules/nonfree/src/opencl/surf.cl b/modules/nonfree/src/opencl/surf.cl index 02f77c224..405e48f02 100644 --- a/modules/nonfree/src/opencl/surf.cl +++ b/modules/nonfree/src/opencl/surf.cl @@ -12,6 +12,7 @@ // // Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved. // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. +// Copyright (C) 2013, Intel Corporation, all rights reserved. // Third party copyrights are property of their respective owners. // // @Authors @@ -66,8 +67,8 @@ uint read_sumTex(IMAGE_INT32 img, sampler_t sam, int2 coord, int rows, int cols, uchar read_imgTex(IMAGE_INT8 img, sampler_t sam, float2 coord, int rows, int cols, int elemPerRow) { #ifdef DISABLE_IMAGE2D - int x = clamp(convert_int_rte(coord.x), 0, cols - 1); - int y = clamp(convert_int_rte(coord.y), 0, rows - 1); + int x = clamp(round(coord.x), 0, cols - 1); + int y = clamp(round(coord.y), 0, rows - 1); return img[elemPerRow * y + x]; #else return (uchar)read_imageui(img, sam, coord).x; @@ -98,6 +99,7 @@ __constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAM #define CV_PI_F 3.14159265f #endif + // Use integral image to calculate haar wavelets. 
// N = 2 // for simple haar paatern @@ -114,10 +116,10 @@ float icvCalcHaarPatternSum_2( F d = 0; - int2 dx1 = convert_int2_rte(ratio * src[0]); - int2 dy1 = convert_int2_rte(ratio * src[1]); - int2 dx2 = convert_int2_rte(ratio * src[2]); - int2 dy2 = convert_int2_rte(ratio * src[3]); + int2 dx1 = convert_int2(round(ratio * src[0])); + int2 dy1 = convert_int2(round(ratio * src[1])); + int2 dx2 = convert_int2(round(ratio * src[2])); + int2 dy2 = convert_int2(round(ratio * src[3])); F t = 0; t += read_sumTex( sumTex, sampler, (int2)(x + dx1.x, y + dy1.x), rows, cols, elemPerRow ); @@ -136,106 +138,9 @@ float icvCalcHaarPatternSum_2( return (float)d; } -// N = 3 -float icvCalcHaarPatternSum_3( - IMAGE_INT32 sumTex, - __constant float4 *src, - int oldSize, - int newSize, - int y, int x, - int rows, int cols, int elemPerRow) -{ - - float ratio = (float)newSize / oldSize; - - F d = 0; - - int4 dx1 = convert_int4_rte(ratio * src[0]); - int4 dy1 = convert_int4_rte(ratio * src[1]); - int4 dx2 = convert_int4_rte(ratio * src[2]); - int4 dy2 = convert_int4_rte(ratio * src[3]); - - F t = 0; - t += read_sumTex( sumTex, sampler, (int2)(x + dx1.x, y + dy1.x), rows, cols, elemPerRow ); - t -= read_sumTex( sumTex, sampler, (int2)(x + dx1.x, y + dy2.x), rows, cols, elemPerRow ); - t -= read_sumTex( sumTex, sampler, (int2)(x + dx2.x, y + dy1.x), rows, cols, elemPerRow ); - t += read_sumTex( sumTex, sampler, (int2)(x + dx2.x, y + dy2.x), rows, cols, elemPerRow ); - d += t * src[4].x / ((dx2.x - dx1.x) * (dy2.x - dy1.x)); - - t = 0; - t += read_sumTex( sumTex, sampler, (int2)(x + dx1.y, y + dy1.y), rows, cols, elemPerRow ); - t -= read_sumTex( sumTex, sampler, (int2)(x + dx1.y, y + dy2.y), rows, cols, elemPerRow ); - t -= read_sumTex( sumTex, sampler, (int2)(x + dx2.y, y + dy1.y), rows, cols, elemPerRow ); - t += read_sumTex( sumTex, sampler, (int2)(x + dx2.y, y + dy2.y), rows, cols, elemPerRow ); - d += t * src[4].y / ((dx2.y - dx1.y) * (dy2.y - dy1.y)); - - t = 0; - t += read_sumTex( sumTex, sampler, (int2)(x + dx1.z, y + dy1.z), rows, cols, elemPerRow ); - t -= read_sumTex( sumTex, sampler, (int2)(x + dx1.z, y + dy2.z), rows, cols, elemPerRow ); - t -= read_sumTex( sumTex, sampler, (int2)(x + dx2.z, y + dy1.z), rows, cols, elemPerRow ); - t += read_sumTex( sumTex, sampler, (int2)(x + dx2.z, y + dy2.z), rows, cols, elemPerRow ); - d += t * src[4].z / ((dx2.z - dx1.z) * (dy2.z - dy1.z)); - - return (float)d; -} - -// N = 4 -float icvCalcHaarPatternSum_4( - IMAGE_INT32 sumTex, - __constant float4 *src, - int oldSize, - int newSize, - int y, int x, - int rows, int cols, int elemPerRow) -{ - - float ratio = (float)newSize / oldSize; - - F d = 0; - - int4 dx1 = convert_int4_rte(ratio * src[0]); - int4 dy1 = convert_int4_rte(ratio * src[1]); - int4 dx2 = convert_int4_rte(ratio * src[2]); - int4 dy2 = convert_int4_rte(ratio * src[3]); - - F t = 0; - t += read_sumTex( sumTex, sampler, (int2)(x + dx1.x, y + dy1.x), rows, cols, elemPerRow ); - t -= read_sumTex( sumTex, sampler, (int2)(x + dx1.x, y + dy2.x), rows, cols, elemPerRow ); - t -= read_sumTex( sumTex, sampler, (int2)(x + dx2.x, y + dy1.x), rows, cols, elemPerRow ); - t += read_sumTex( sumTex, sampler, (int2)(x + dx2.x, y + dy2.x), rows, cols, elemPerRow ); - d += t * src[4].x / ((dx2.x - dx1.x) * (dy2.x - dy1.x)); - - t = 0; - t += read_sumTex( sumTex, sampler, (int2)(x + dx1.y, y + dy1.y), rows, cols, elemPerRow ); - t -= read_sumTex( sumTex, sampler, (int2)(x + dx1.y, y + dy2.y), rows, cols, elemPerRow ); - t -= read_sumTex( sumTex, sampler, (int2)(x + 
dx2.y, y + dy1.y), rows, cols, elemPerRow ); - t += read_sumTex( sumTex, sampler, (int2)(x + dx2.y, y + dy2.y), rows, cols, elemPerRow ); - d += t * src[4].y / ((dx2.y - dx1.y) * (dy2.y - dy1.y)); - - t = 0; - t += read_sumTex( sumTex, sampler, (int2)(x + dx1.z, y + dy1.z), rows, cols, elemPerRow ); - t -= read_sumTex( sumTex, sampler, (int2)(x + dx1.z, y + dy2.z), rows, cols, elemPerRow ); - t -= read_sumTex( sumTex, sampler, (int2)(x + dx2.z, y + dy1.z), rows, cols, elemPerRow ); - t += read_sumTex( sumTex, sampler, (int2)(x + dx2.z, y + dy2.z), rows, cols, elemPerRow ); - d += t * src[4].z / ((dx2.z - dx1.z) * (dy2.z - dy1.z)); - - t = 0; - t += read_sumTex( sumTex, sampler, (int2)(x + dx1.w, y + dy1.w), rows, cols, elemPerRow ); - t -= read_sumTex( sumTex, sampler, (int2)(x + dx1.w, y + dy2.w), rows, cols, elemPerRow ); - t -= read_sumTex( sumTex, sampler, (int2)(x + dx2.w, y + dy1.w), rows, cols, elemPerRow ); - t += read_sumTex( sumTex, sampler, (int2)(x + dx2.w, y + dy2.w), rows, cols, elemPerRow ); - d += t * src[4].w / ((dx2.w - dx1.w) * (dy2.w - dy1.w)); - - return (float)d; -} - //////////////////////////////////////////////////////////////////////// // Hessian -__constant float4 c_DX[5] = { (float4)(0, 3, 6, 0), (float4)(2, 2, 2, 0), (float4)(3, 6, 9, 0), (float4)(7, 7, 7, 0), (float4)(1, -2, 1, 0) }; -__constant float4 c_DY[5] = { (float4)(2, 2, 2, 0), (float4)(0, 3, 6, 0), (float4)(7, 7, 7, 0), (float4)(3, 6, 9, 0), (float4)(1, -2, 1, 0) }; -__constant float4 c_DXY[5] = { (float4)(1, 5, 1, 5), (float4)(1, 1, 5, 5), (float4)(4, 8, 4, 8), (float4)(4, 4, 8, 8), (float4)(1, -1, -1, 1) };// Use integral image to calculate haar wavelets. - __inline int calcSize(int octave, int layer) { /* Wavelet size at first layer of first octave. */ @@ -250,6 +155,24 @@ __inline int calcSize(int octave, int layer) return (HAAR_SIZE0 + HAAR_SIZE_INC * layer) << octave; } +// Calculate a derivative in an axis-aligned direction (x or y). The "plus1" +// boxes contribute 1 * (area), and the "minus2" box contributes -2 * (area). +// So the final computation is plus1a + plus1b - 2 * minus2. The corners are +// labeled A, B, C, and D, with A being the top left, B being top right, C +// being bottom left, and D being bottom right. 
+F calcAxisAlignedDerivative( + int plus1a_A, int plus1a_B, int plus1a_C, int plus1a_D, F plus1a_scale, + int plus1b_A, int plus1b_B, int plus1b_C, int plus1b_D, F plus1b_scale, + int minus2_A, int minus2_B, int minus2_C, int minus2_D, F minus2_scale) +{ + F plus1a = plus1a_A - plus1a_B - plus1a_C + plus1a_D; + F plus1b = plus1b_A - plus1b_B - plus1b_C + plus1b_D; + F minus2 = minus2_A - minus2_B - minus2_C + minus2_D; + + return (plus1a / plus1a_scale - + 2.0f * minus2 / minus2_scale + + plus1b / plus1b_scale); +} //calculate targeted layer per-pixel determinant and trace with an integral image __kernel void icvCalcLayerDetAndTrace( @@ -264,7 +187,7 @@ __kernel void icvCalcLayerDetAndTrace( int c_octave, int c_layer_rows, int sumTex_step -) + ) { det_step /= sizeof(*det); trace_step /= sizeof(*trace); @@ -288,16 +211,103 @@ __kernel void icvCalcLayerDetAndTrace( if (size <= c_img_rows && size <= c_img_cols && i < samples_i && j < samples_j) { - const float dx = icvCalcHaarPatternSum_3(sumTex, c_DX , 9, size, i << c_octave, j << c_octave, c_img_rows, c_img_cols, sumTex_step); - const float dy = icvCalcHaarPatternSum_3(sumTex, c_DY , 9, size, i << c_octave, j << c_octave, c_img_rows, c_img_cols, sumTex_step); - const float dxy = icvCalcHaarPatternSum_4(sumTex, c_DXY, 9, size, i << c_octave, j << c_octave, c_img_rows, c_img_cols, sumTex_step); + int x = j << c_octave; + int y = i << c_octave; + + float ratio = (float)size / 9; + + // Precompute some commonly used values, which are used to offset + // texture coordinates in the integral image. + int r1 = round(ratio); + int r2 = round(ratio * 2.0f); + int r3 = round(ratio * 3.0f); + int r4 = round(ratio * 4.0f); + int r5 = round(ratio * 5.0f); + int r6 = round(ratio * 6.0f); + int r7 = round(ratio * 7.0f); + int r8 = round(ratio * 8.0f); + int r9 = round(ratio * 9.0f); + + // Calculate the approximated derivative in the x-direction + F d = 0; + { + // Some of the pixels needed to compute the derivative are + // repeated, so we only don't duplicate the fetch here. + int t02 = read_sumTex( sumTex, sampler, (int2)(x, y + r2), c_img_rows, c_img_cols, sumTex_step ); + int t07 = read_sumTex( sumTex, sampler, (int2)(x, y + r7), c_img_rows, c_img_cols, sumTex_step ); + int t32 = read_sumTex( sumTex, sampler, (int2)(x + r3, y + r2), c_img_rows, c_img_cols, sumTex_step ); + int t37 = read_sumTex( sumTex, sampler, (int2)(x + r3, y + r7), c_img_rows, c_img_cols, sumTex_step ); + int t62 = read_sumTex( sumTex, sampler, (int2)(x + r6, y + r2), c_img_rows, c_img_cols, sumTex_step ); + int t67 = read_sumTex( sumTex, sampler, (int2)(x + r6, y + r7), c_img_rows, c_img_cols, sumTex_step ); + int t92 = read_sumTex( sumTex, sampler, (int2)(x + r9, y + r2), c_img_rows, c_img_cols, sumTex_step ); + int t97 = read_sumTex( sumTex, sampler, (int2)(x + r9, y + r7), c_img_rows, c_img_cols, sumTex_step ); + + d = calcAxisAlignedDerivative(t02, t07, t32, t37, (r3) * (r7 - r2), + t62, t67, t92, t97, (r9 - r6) * (r7 - r2), + t32, t37, t62, t67, (r6 - r3) * (r7 - r2)); + } + const float dx = (float)d; + + // Calculate the approximated derivative in the y-direction + d = 0; + { + // Some of the pixels needed to compute the derivative are + // repeated, so we only don't duplicate the fetch here. 
+ int t20 = read_sumTex( sumTex, sampler, (int2)(x + r2, y), c_img_rows, c_img_cols, sumTex_step ); + int t23 = read_sumTex( sumTex, sampler, (int2)(x + r2, y + r3), c_img_rows, c_img_cols, sumTex_step ); + int t70 = read_sumTex( sumTex, sampler, (int2)(x + r7, y), c_img_rows, c_img_cols, sumTex_step ); + int t73 = read_sumTex( sumTex, sampler, (int2)(x + r7, y + r3), c_img_rows, c_img_cols, sumTex_step ); + int t26 = read_sumTex( sumTex, sampler, (int2)(x + r2, y + r6), c_img_rows, c_img_cols, sumTex_step ); + int t76 = read_sumTex( sumTex, sampler, (int2)(x + r7, y + r6), c_img_rows, c_img_cols, sumTex_step ); + int t29 = read_sumTex( sumTex, sampler, (int2)(x + r2, y + r9), c_img_rows, c_img_cols, sumTex_step ); + int t79 = read_sumTex( sumTex, sampler, (int2)(x + r7, y + r9), c_img_rows, c_img_cols, sumTex_step ); + + d = calcAxisAlignedDerivative(t20, t23, t70, t73, (r7 - r2) * (r3), + t26, t29, t76, t79, (r7 - r2) * (r9 - r6), + t23, t26, t73, t76, (r7 - r2) * (r6 - r3)); + } + const float dy = (float)d; + + // Calculate the approximated derivative in the xy-direction + d = 0; + { + // There's no saving us here, we just have to get all of the pixels in + // separate fetches + F t = 0; + t += read_sumTex( sumTex, sampler, (int2)(x + r1, y + r1), c_img_rows, c_img_cols, sumTex_step ); + t -= read_sumTex( sumTex, sampler, (int2)(x + r1, y + r4), c_img_rows, c_img_cols, sumTex_step ); + t -= read_sumTex( sumTex, sampler, (int2)(x + r4, y + r1), c_img_rows, c_img_cols, sumTex_step ); + t += read_sumTex( sumTex, sampler, (int2)(x + r4, y + r4), c_img_rows, c_img_cols, sumTex_step ); + d += t / ((r4 - r1) * (r4 - r1)); + + t = 0; + t += read_sumTex( sumTex, sampler, (int2)(x + r5, y + r1), c_img_rows, c_img_cols, sumTex_step ); + t -= read_sumTex( sumTex, sampler, (int2)(x + r5, y + r4), c_img_rows, c_img_cols, sumTex_step ); + t -= read_sumTex( sumTex, sampler, (int2)(x + r8, y + r1), c_img_rows, c_img_cols, sumTex_step ); + t += read_sumTex( sumTex, sampler, (int2)(x + r8, y + r4), c_img_rows, c_img_cols, sumTex_step ); + d -= t / ((r8 - r5) * (r4 - r1)); + + t = 0; + t += read_sumTex( sumTex, sampler, (int2)(x + r1, y + r5), c_img_rows, c_img_cols, sumTex_step ); + t -= read_sumTex( sumTex, sampler, (int2)(x + r1, y + r8), c_img_rows, c_img_cols, sumTex_step ); + t -= read_sumTex( sumTex, sampler, (int2)(x + r4, y + r5), c_img_rows, c_img_cols, sumTex_step ); + t += read_sumTex( sumTex, sampler, (int2)(x + r4, y + r8), c_img_rows, c_img_cols, sumTex_step ); + d -= t / ((r4 - r1) * (r8 - r5)); + + t = 0; + t += read_sumTex( sumTex, sampler, (int2)(x + r5, y + r5), c_img_rows, c_img_cols, sumTex_step ); + t -= read_sumTex( sumTex, sampler, (int2)(x + r5, y + r8), c_img_rows, c_img_cols, sumTex_step ); + t -= read_sumTex( sumTex, sampler, (int2)(x + r8, y + r5), c_img_rows, c_img_cols, sumTex_step ); + t += read_sumTex( sumTex, sampler, (int2)(x + r8, y + r8), c_img_rows, c_img_cols, sumTex_step ); + d += t / ((r8 - r5) * (r8 - r5)); + } + const float dxy = (float)d; det [j + margin + det_step * (layer * c_layer_rows + i + margin)] = dx * dy - 0.81f * dxy * dxy; trace[j + margin + trace_step * (layer * c_layer_rows + i + margin)] = dx + dy; } } - //////////////////////////////////////////////////////////////////////// // NONMAX @@ -309,10 +319,10 @@ bool within_check(IMAGE_INT32 maskSumTex, int sum_i, int sum_j, int size, int ro float d = 0; - int dx1 = convert_int_rte(ratio * c_DM[0]); - int dy1 = convert_int_rte(ratio * c_DM[1]); - int dx2 = convert_int_rte(ratio * c_DM[2]); - int dy2 = 
convert_int_rte(ratio * c_DM[3]); + int dx1 = round(ratio * c_DM[0]); + int dy1 = round(ratio * c_DM[1]); + int dx2 = round(ratio * c_DM[2]); + int dy2 = round(ratio * c_DM[3]); float t = 0; @@ -572,7 +582,7 @@ void icvFindMaximaInLayer( } // solve 3x3 linear system Ax=b for floating point input -inline bool solve3x3_float(volatile __local const float4 *A, volatile __local const float *b, volatile __local float *x) +inline bool solve3x3_float(const float4 *A, const float *b, float *x) { float det = A[0].x * (A[1].y * A[2].z - A[1].z * A[2].y) - A[0].y * (A[1].x * A[2].z - A[1].z * A[2].x) @@ -651,7 +661,7 @@ void icvInterpolateKeypoint( if (get_local_id(0) == 0 && get_local_id(1) == 0 && get_local_id(2) == 0) { - volatile __local float dD[3]; + float dD[3]; //dx dD[0] = -0.5f * (N9[1][1][2] - N9[1][1][0]); @@ -660,7 +670,7 @@ void icvInterpolateKeypoint( //ds dD[2] = -0.5f * (N9[2][1][1] - N9[0][1][1]); - volatile __local float4 H[3]; + float4 H[3]; //dxx H[0].x = N9[1][1][0] - 2.0f * N9[1][1][1] + N9[1][1][2]; @@ -681,7 +691,7 @@ void icvInterpolateKeypoint( //dss H[2].z = N9[0][1][1] - 2.0f * N9[1][1][1] + N9[2][1][1]; - volatile __local float x[3]; + float x[3]; if (solve3x3_float(H, dD, x)) { @@ -711,7 +721,7 @@ void icvInterpolateKeypoint( sampled in a circle of radius 6s using wavelets of size 4s. We ensure the gradient wavelet size is even to ensure the wavelet pattern is balanced and symmetric around its center */ - const int grad_wav_size = 2 * convert_int_rte(2.0f * s); + const int grad_wav_size = 2 * round(2.0f * s); // check when grad_wav_size is too big if ((c_img_rows + 1) >= grad_wav_size && (c_img_cols + 1) >= grad_wav_size) @@ -737,9 +747,12 @@ void icvInterpolateKeypoint( //////////////////////////////////////////////////////////////////////// // Orientation -#define ORI_SEARCH_INC 5 -#define ORI_WIN 60 -#define ORI_SAMPLES 113 +#define ORI_WIN 60 +#define ORI_SAMPLES 113 + +// The distance between samples in the beginning of the the reduction +#define ORI_RESPONSE_REDUCTION_WIDTH 48 +#define ORI_RESPONSE_ARRAY_SIZE (ORI_RESPONSE_REDUCTION_WIDTH * 2) __constant float c_aptX[ORI_SAMPLES] = {-6, -5, -5, -5, -5, -5, -5, -5, -4, -4, -4, -4, -4, -4, -4, -4, -4, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 6}; __constant float c_aptY[ORI_SAMPLES] = {0, -3, -2, -1, 0, 1, 2, 3, -4, -3, -2, -1, 0, 1, 2, 3, 4, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -4, -3, -2, -1, 0, 1, 2, 3, 4, -3, -2, -1, 0, 1, 2, 3, 0}; @@ -833,12 +846,15 @@ void icvCalcOrientation( __global float* featureDir = keypoints + ANGLE_ROW * keypoints_step; - volatile __local float s_X[128]; - volatile __local float s_Y[128]; - volatile __local float s_angle[128]; + __local float s_X[ORI_SAMPLES]; + __local float s_Y[ORI_SAMPLES]; + __local float s_angle[ORI_SAMPLES]; - volatile __local float s_sumx[32 * 4]; - volatile __local float s_sumy[32 * 4]; + // Need to allocate enough to make the reduction work without accessing + // past the end of the array. 
+ __local float s_sumx[ORI_RESPONSE_ARRAY_SIZE]; + __local float s_sumy[ORI_RESPONSE_ARRAY_SIZE]; + __local float s_mod[ORI_RESPONSE_ARRAY_SIZE]; /* The sampling intervals and wavelet sized for selecting an orientation and building the keypoint descriptor are defined relative to 's' */ @@ -849,28 +865,60 @@ void icvCalcOrientation( sampled in a circle of radius 6s using wavelets of size 4s. We ensure the gradient wavelet size is even to ensure the wavelet pattern is balanced and symmetric around its center */ - const int grad_wav_size = 2 * convert_int_rte(2.0f * s); + const int grad_wav_size = 2 * round(2.0f * s); // check when grad_wav_size is too big if ((c_img_rows + 1) < grad_wav_size || (c_img_cols + 1) < grad_wav_size) return; // Calc X, Y, angle and store it to shared memory - const int tid = get_local_id(1) * get_local_size(0) + get_local_id(0); + const int tid = get_local_id(0); + // Initialize values that are only used as part of the reduction later. + if (tid < ORI_RESPONSE_ARRAY_SIZE - ORI_LOCAL_SIZE) { + s_mod[tid + ORI_LOCAL_SIZE] = 0.0f; + } - float X = 0.0f, Y = 0.0f, angle = 0.0f; + float ratio = (float)grad_wav_size / 4; - if (tid < ORI_SAMPLES) + int r2 = round(ratio * 2.0); + int r4 = round(ratio * 4.0); + for (int i = tid; i < ORI_SAMPLES; i += ORI_LOCAL_SIZE ) { + float X = 0.0f, Y = 0.0f, angle = 0.0f; const float margin = (float)(grad_wav_size - 1) / 2.0f; - const int x = convert_int_rte(featureX[get_group_id(0)] + c_aptX[tid] * s - margin); - const int y = convert_int_rte(featureY[get_group_id(0)] + c_aptY[tid] * s - margin); + const int x = round(featureX[get_group_id(0)] + c_aptX[i] * s - margin); + const int y = round(featureY[get_group_id(0)] + c_aptY[i] * s - margin); if (y >= 0 && y < (c_img_rows + 1) - grad_wav_size && - x >= 0 && x < (c_img_cols + 1) - grad_wav_size) + x >= 0 && x < (c_img_cols + 1) - grad_wav_size) { - X = c_aptW[tid] * icvCalcHaarPatternSum_2(sumTex, c_NX, 4, grad_wav_size, y, x, c_img_rows, c_img_cols, sum_step); - Y = c_aptW[tid] * icvCalcHaarPatternSum_2(sumTex, c_NY, 4, grad_wav_size, y, x, c_img_rows, c_img_cols, sum_step); + + float apt = c_aptW[i]; + + // Compute the haar sum without fetching duplicate pixels. 
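+                // The eight samples t00..t44 lie on a 3x3 grid of integral-image corners
+                // at offsets {0, r2, r4} in x and y (the centre corner is not needed).
+                // The left/right halves of the X wavelet and the top/bottom halves of the
+                // Y wavelet are all box sums over these shared corners, so one set of
+                // eight fetches replaces the separate per-box lookups of the removed
+                // icvCalcHaarPatternSum_2 calls.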
+ float t00 = read_sumTex( sumTex, sampler, (int2)(x, y), c_img_rows, c_img_cols, sum_step); + float t02 = read_sumTex( sumTex, sampler, (int2)(x, y + r2), c_img_rows, c_img_cols, sum_step); + float t04 = read_sumTex( sumTex, sampler, (int2)(x, y + r4), c_img_rows, c_img_cols, sum_step); + float t20 = read_sumTex( sumTex, sampler, (int2)(x + r2, y), c_img_rows, c_img_cols, sum_step); + float t24 = read_sumTex( sumTex, sampler, (int2)(x + r2, y + r4), c_img_rows, c_img_cols, sum_step); + float t40 = read_sumTex( sumTex, sampler, (int2)(x + r4, y), c_img_rows, c_img_cols, sum_step); + float t42 = read_sumTex( sumTex, sampler, (int2)(x + r4, y + r2), c_img_rows, c_img_cols, sum_step); + float t44 = read_sumTex( sumTex, sampler, (int2)(x + r4, y + r4), c_img_rows, c_img_cols, sum_step); + + F t = t00 - t04 - t20 + t24; + X -= t / ((r2) * (r4)); + + t = t20 - t24 - t40 + t44; + X += t / ((r4 - r2) * (r4)); + + t = t00 - t02 - t40 + t42; + Y += t / ((r2) * (r4)); + + t = t02 - t04 - t42 + t44; + Y -= t / ((r4) * (r4 - r2)); + + X = apt*X; + Y = apt*Y; angle = atan2(Y, X); @@ -879,76 +927,61 @@ void icvCalcOrientation( angle *= 180.0f / CV_PI_F; } + + s_X[i] = X; + s_Y[i] = Y; + s_angle[i] = angle; } - s_X[tid] = X; - s_Y[tid] = Y; - s_angle[tid] = angle; barrier(CLK_LOCAL_MEM_FENCE); float bestx = 0, besty = 0, best_mod = 0; + float sumx = 0.0f, sumy = 0.0f; + const int dir = tid * ORI_SEARCH_INC; + #pragma unroll + for (int i = 0; i < ORI_SAMPLES; ++i) { + int angle = round(s_angle[i]); -#pragma unroll - for (int i = 0; i < 18; ++i) - { - const int dir = (i * 4 + get_local_id(1)) * ORI_SEARCH_INC; + int d = abs(angle - dir); + if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2) + { + sumx += s_X[i]; + sumy += s_Y[i]; + } + } + s_sumx[tid] = sumx; + s_sumy[tid] = sumy; + s_mod[tid] = sumx*sumx + sumy*sumy; + barrier(CLK_LOCAL_MEM_FENCE); - volatile float sumx = 0.0f, sumy = 0.0f; - int d = abs(convert_int_rte(s_angle[get_local_id(0)]) - dir); - if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2) - { - sumx = s_X[get_local_id(0)]; - sumy = s_Y[get_local_id(0)]; - } - d = abs(convert_int_rte(s_angle[get_local_id(0) + 32]) - dir); - if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2) - { - sumx += s_X[get_local_id(0) + 32]; - sumy += s_Y[get_local_id(0) + 32]; - } - d = abs(convert_int_rte(s_angle[get_local_id(0) + 64]) - dir); - if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2) - { - sumx += s_X[get_local_id(0) + 64]; - sumy += s_Y[get_local_id(0) + 64]; - } - d = abs(convert_int_rte(s_angle[get_local_id(0) + 96]) - dir); - if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2) - { - sumx += s_X[get_local_id(0) + 96]; - sumy += s_Y[get_local_id(0) + 96]; - } - reduce_32_sum(s_sumx + get_local_id(1) * 32, &sumx, get_local_id(0)); - reduce_32_sum(s_sumy + get_local_id(1) * 32, &sumy, get_local_id(0)); - - const float temp_mod = sumx * sumx + sumy * sumy; - if (temp_mod > best_mod) - { - best_mod = temp_mod; - bestx = sumx; - besty = sumy; + // This reduction searches for the longest wavelet response vector. The first + // step uses all of the work items in the workgroup to narrow the search + // down to the three candidates. It requires s_mod to have a few more + // elements alocated past the work-group size, which are pre-initialized to + // 0.0f above. 
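+    // With ORI_LOCAL_SIZE = 72 (360 / ORI_SEARCH_INC) and
+    // ORI_RESPONSE_REDUCTION_WIDTH = 48, the first pass folds entries [48, 96)
+    // onto [0, 48); the zero-initialized tail [72, 96) can never win because the
+    // comparison only replaces a strictly smaller s_mod.  The remaining passes
+    // (t = 24, 12, 6, 3) halve the range until only the three candidates at
+    // indices 0..2 are left for the final comparison below.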
+ for(int t = ORI_RESPONSE_REDUCTION_WIDTH; t >= 3; t /= 2) { + if (tid < t) { + if (s_mod[tid] < s_mod[tid + t]) { + s_mod[tid] = s_mod[tid + t]; + s_sumx[tid] = s_sumx[tid + t]; + s_sumy[tid] = s_sumy[tid + t]; + } } barrier(CLK_LOCAL_MEM_FENCE); } - if (get_local_id(0) == 0) - { - s_X[get_local_id(1)] = bestx; - s_Y[get_local_id(1)] = besty; - s_angle[get_local_id(1)] = best_mod; - } - barrier(CLK_LOCAL_MEM_FENCE); - if (get_local_id(1) == 0 && get_local_id(0) == 0) + // Do the final reduction and write out the result. + if (tid == 0) { int bestIdx = 0; - if (s_angle[1] > s_angle[bestIdx]) + // The loop above narrowed the search of the longest vector to three + // possibilities. Pick the best here. + if (s_mod[1] > s_mod[bestIdx]) bestIdx = 1; - if (s_angle[2] > s_angle[bestIdx]) + if (s_mod[2] > s_mod[bestIdx]) bestIdx = 2; - if (s_angle[3] > s_angle[bestIdx]) - bestIdx = 3; - float kp_dir = atan2(s_Y[bestIdx], s_X[bestIdx]); + float kp_dir = atan2(s_sumy[bestIdx], s_sumx[bestIdx]); if (kp_dir < 0) kp_dir += 2.0f * CV_PI_F; kp_dir *= 180.0f / CV_PI_F; @@ -961,7 +994,6 @@ void icvCalcOrientation( } } - __kernel void icvSetUpright( __global float * keypoints, @@ -1035,8 +1067,8 @@ inline float linearFilter( float out = 0.0f; - const int x1 = convert_int_rtn(x); - const int y1 = convert_int_rtn(y); + const int x1 = round(x); + const int y1 = round(y); const int x2 = x1 + 1; const int y2 = y1 + 1; diff --git a/modules/nonfree/src/surf.ocl.cpp b/modules/nonfree/src/surf.ocl.cpp index 5ade5e517..8fd717c6c 100644 --- a/modules/nonfree/src/surf.ocl.cpp +++ b/modules/nonfree/src/surf.ocl.cpp @@ -46,6 +46,7 @@ #ifdef HAVE_OPENCV_OCL #include +#include #include "opencl_kernels.hpp" using namespace cv; @@ -57,18 +58,25 @@ namespace cv { namespace ocl { + // The number of degrees between orientation samples in calcOrientation + const static int ORI_SEARCH_INC = 5; + // The local size of the calcOrientation kernel + const static int ORI_LOCAL_SIZE = (360 / ORI_SEARCH_INC); + static void openCLExecuteKernelSURF(Context *clCxt, const cv::ocl::ProgramEntry* source, String kernelName, size_t globalThreads[3], size_t localThreads[3], std::vector< std::pair > &args, int channels, int depth) { - char optBuf [100] = {0}; - char * optBufPtr = optBuf; + std::stringstream optsStr; + optsStr << "-D ORI_LOCAL_SIZE=" << ORI_LOCAL_SIZE << " "; + optsStr << "-D ORI_SEARCH_INC=" << ORI_SEARCH_INC << " "; cl_kernel kernel; - kernel = openCLGetKernelFromSource(clCxt, source, kernelName, optBufPtr); + kernel = openCLGetKernelFromSource(clCxt, source, kernelName, optsStr.str().c_str()); size_t wave_size = queryWaveFrontSize(kernel); CV_Assert(clReleaseKernel(kernel) == CL_SUCCESS); - sprintf(optBufPtr, "-D WAVE_SIZE=%d", static_cast(wave_size)); - openCLExecuteKernel(clCxt, source, kernelName, globalThreads, localThreads, args, channels, depth, optBufPtr); + optsStr << "-D WAVE_SIZE=" << wave_size; + openCLExecuteKernel(clCxt, source, kernelName, globalThreads, localThreads, args, channels, depth, optsStr.str().c_str()); } + } } @@ -601,8 +609,8 @@ void SURF_OCL_Invoker::icvCalcOrientation_gpu(const oclMat &keypoints, int nFeat args.push_back( std::make_pair( sizeof(cl_int), (void *)&img_cols)); args.push_back( std::make_pair( sizeof(cl_int), (void *)&surf_.sum.step)); - size_t localThreads[3] = {32, 4, 1}; - size_t globalThreads[3] = {nFeatures *localThreads[0], localThreads[1], 1}; + size_t localThreads[3] = {ORI_LOCAL_SIZE, 1, 1}; + size_t globalThreads[3] = {nFeatures * localThreads[0], 1, 1}; 
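The one-dimensional launch above (one work-group of ORI_LOCAL_SIZE work-items per keypoint) replaces the old 32x4 layout. For reference, the work each work-group performs is equivalent to the scalar sketch below; the function and parameter names are illustrative only and are not part of the patch. respX, respY and angleDeg stand for the per-sample values the kernel keeps in s_X, s_Y and s_angle.

#include <cmath>
#include <cstdlib>

// Scalar reference of the per-keypoint orientation search: each candidate
// direction corresponds to one work-item in the kernel, and the argmax over
// mod is what the s_mod reduction computes.
static float referenceOrientation(const float* respX, const float* respY,
                                  const float* angleDeg, int nSamples)
{
    const int ORI_SEARCH_INC = 5;   // degrees between candidate directions
    const int ORI_WIN        = 60;  // width of the sliding orientation window
    float bestx = 0.f, besty = 0.f, bestMod = -1.f;
    for (int dir = 0; dir < 360; dir += ORI_SEARCH_INC)
    {
        float sumx = 0.f, sumy = 0.f;
        for (int i = 0; i < nSamples; ++i)
        {
            int d = std::abs((int)std::lround(angleDeg[i]) - dir);
            if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)   // window wraps at 0/360
            {
                sumx += respX[i];
                sumy += respY[i];
            }
        }
        const float mod = sumx * sumx + sumy * sumy;        // what s_mod[tid] holds
        if (mod > bestMod)
        {
            bestMod = mod;
            bestx   = sumx;
            besty   = sumy;
        }
    }
    float dir = std::atan2(besty, bestx) * 180.f / 3.14159265f;
    return dir < 0.f ? dir + 360.f : dir;                   // keypoint angle in degrees
}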
openCLExecuteKernelSURF(clCxt, &surfprog, kernelName, globalThreads, localThreads, args, -1, -1); } diff --git a/modules/ocl/doc/image_filtering.rst b/modules/ocl/doc/image_filtering.rst index 33f1b2796..6fbc19a71 100644 --- a/modules/ocl/doc/image_filtering.rst +++ b/modules/ocl/doc/image_filtering.rst @@ -287,7 +287,7 @@ ocl::createSeparableLinearFilter_GPU ---------------------------------------- Creates a separable linear filter engine. -.. ocv:function:: Ptr ocl::createSeparableLinearFilter_GPU(int srcType, int dstType, const Mat &rowKernel, const Mat &columnKernel, const Point &anchor = Point(-1, -1), double delta = 0.0, int bordertype = BORDER_DEFAULT) +.. ocv:function:: Ptr ocl::createSeparableLinearFilter_GPU(int srcType, int dstType, const Mat &rowKernel, const Mat &columnKernel, const Point &anchor = Point(-1, -1), double delta = 0.0, int bordertype = BORDER_DEFAULT, Size imgSize = Size(-1,-1) ) :param srcType: Source array type. ``CV_8UC1`` , ``CV_8UC4`` , ``CV_16SC1`` , ``CV_16SC2`` , ``CV_16SC3`` , ``CV_32SC1`` , ``CV_32FC1`` source types are supported. @@ -303,6 +303,8 @@ Creates a separable linear filter engine. :param bordertype: Pixel extrapolation method. + :param imgSize: Source image size to choose optimal method for processing. + .. seealso:: :ocv:func:`ocl::getLinearRowFilter_GPU`, :ocv:func:`ocl::getLinearColumnFilter_GPU`, :ocv:func:`createSeparableLinearFilter` @@ -334,7 +336,7 @@ ocl::createDerivFilter_GPU ------------------------------ Creates a filter engine for the generalized Sobel operator. -.. ocv:function:: Ptr ocl::createDerivFilter_GPU( int srcType, int dstType, int dx, int dy, int ksize, int borderType = BORDER_DEFAULT ) +.. ocv:function:: Ptr ocl::createDerivFilter_GPU( int srcType, int dstType, int dx, int dy, int ksize, int borderType = BORDER_DEFAULT, Size imgSize = Size(-1,-1) ) :param srcType: Source image type. ``CV_8UC1`` , ``CV_8UC4`` , ``CV_16SC1`` , ``CV_16SC2`` , ``CV_16SC3`` , ``CV_32SC1`` , ``CV_32FC1`` source types are supported. @@ -348,6 +350,8 @@ Creates a filter engine for the generalized Sobel operator. :param borderType: Pixel extrapolation method. For details, see :ocv:func:`borderInterpolate`. + :param imgSize: Source image size to choose optimal method for processing. + .. seealso:: :ocv:func:`ocl::createSeparableLinearFilter_GPU`, :ocv:func:`createDerivFilter` @@ -405,7 +409,7 @@ ocl::createGaussianFilter_GPU --------------------------------- Creates a Gaussian filter engine. -.. ocv:function:: Ptr ocl::createGaussianFilter_GPU(int type, Size ksize, double sigma1, double sigma2 = 0, int bordertype = BORDER_DEFAULT) +.. ocv:function:: Ptr ocl::createGaussianFilter_GPU(int type, Size ksize, double sigma1, double sigma2 = 0, int bordertype = BORDER_DEFAULT, Size imgSize = Size(-1,-1) ) :param type: Source and destination image type. ``CV_8UC1`` , ``CV_8UC4`` , ``CV_16SC1`` , ``CV_16SC2`` , ``CV_16SC3`` , ``CV_32SC1`` , ``CV_32FC1`` are supported. @@ -417,6 +421,8 @@ Creates a Gaussian filter engine. :param bordertype: Pixel extrapolation method. For details, see :ocv:func:`borderInterpolate`. + :param imgSize: Source image size to choose optimal method for processing. + .. seealso:: :ocv:func:`ocl::createSeparableLinearFilter_GPU`, :ocv:func:`createGaussianFilter` ocl::GaussianBlur diff --git a/modules/ocl/include/opencv2/ocl.hpp b/modules/ocl/include/opencv2/ocl.hpp index 542dbeb0b..bc838924a 100644 --- a/modules/ocl/include/opencv2/ocl.hpp +++ b/modules/ocl/include/opencv2/ocl.hpp @@ -695,17 +695,17 @@ namespace cv //! 
returns the separable linear filter engine CV_EXPORTS Ptr createSeparableLinearFilter_GPU(int srcType, int dstType, const Mat &rowKernel, - const Mat &columnKernel, const Point &anchor = Point(-1, -1), double delta = 0.0, int bordertype = BORDER_DEFAULT); + const Mat &columnKernel, const Point &anchor = Point(-1, -1), double delta = 0.0, int bordertype = BORDER_DEFAULT, Size imgSize = Size(-1,-1)); //! returns the separable filter engine with the specified filters CV_EXPORTS Ptr createSeparableFilter_GPU(const Ptr &rowFilter, const Ptr &columnFilter); //! returns the Gaussian filter engine - CV_EXPORTS Ptr createGaussianFilter_GPU(int type, Size ksize, double sigma1, double sigma2 = 0, int bordertype = BORDER_DEFAULT); + CV_EXPORTS Ptr createGaussianFilter_GPU(int type, Size ksize, double sigma1, double sigma2 = 0, int bordertype = BORDER_DEFAULT, Size imgSize = Size(-1,-1)); //! returns filter engine for the generalized Sobel operator - CV_EXPORTS Ptr createDerivFilter_GPU( int srcType, int dstType, int dx, int dy, int ksize, int borderType = BORDER_DEFAULT ); + CV_EXPORTS Ptr createDerivFilter_GPU( int srcType, int dstType, int dx, int dy, int ksize, int borderType = BORDER_DEFAULT, Size imgSize = Size(-1,-1) ); //! applies Laplacian operator to the image // supports only ksize = 1 and ksize = 3 @@ -1439,8 +1439,10 @@ namespace cv oclMat Dx_; oclMat Dy_; oclMat eig_; + oclMat eig_minmax_; oclMat minMaxbuf_; oclMat tmpCorners_; + oclMat counter_; }; inline GoodFeaturesToTrackDetector_OCL::GoodFeaturesToTrackDetector_OCL(int maxCorners_, double qualityLevel_, double minDistance_, diff --git a/modules/ocl/src/color.cpp b/modules/ocl/src/color.cpp index f71081d78..408ba4cce 100644 --- a/modules/ocl/src/color.cpp +++ b/modules/ocl/src/color.cpp @@ -56,8 +56,19 @@ static void fromRGB_caller(const oclMat &src, oclMat &dst, int bidx, const std:: { int src_offset = src.offset / src.elemSize1(), src_step = src.step1(); int dst_offset = dst.offset / dst.elemSize1(), dst_step = dst.step1(); + int pixels_per_work_item = 1; - String build_options = format("-D DEPTH_%d", src.depth()); + if (Context::getContext()->supportsFeature(FEATURE_CL_INTEL_DEVICE)) + { + if ((src.cols % 4 == 0) && (src.depth() == CV_8U)) + pixels_per_work_item = 4; + else if (src.cols % 2 == 0) + pixels_per_work_item = 2; + else + pixels_per_work_item = 1; + } + + String build_options = format("-D DEPTH_%d -D scn=%d -D bidx=%d -D pixels_per_work_item=%d", src.depth(), src.oclchannels(), bidx, pixels_per_work_item); if (!additionalOptions.empty()) build_options = build_options + additionalOptions; @@ -66,7 +77,6 @@ static void fromRGB_caller(const oclMat &src, oclMat &dst, int bidx, const std:: args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.rows)); args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_step)); args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst_step)); - args.push_back( std::make_pair( sizeof(cl_int) , (void *)&bidx)); args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data)); args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst.data)); args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_offset )); @@ -77,6 +87,73 @@ static void fromRGB_caller(const oclMat &src, oclMat &dst, int bidx, const std:: if (!data2.empty()) args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&data2.data )); + size_t gt[3] = { dst.cols/pixels_per_work_item, dst.rows, 1 }; +#ifdef ANDROID + size_t lt[3] = { 16, 10, 1 }; +#else + size_t lt[3] = { 16, 16, 1 }; +#endif + 
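To make the effect of pixels_per_work_item concrete, the following host-side sketch works out the launch for an assumed 1920x1080 8-bit source on a device reporting FEATURE_CL_INTEL_DEVICE; the image size is an assumption for illustration, and the selection logic mirrors the code added above.

#include <cstddef>

// Illustration only: assumed 1920x1080 8-bit source on an Intel OpenCL device.
const int  cols = 1920, rows = 1080;
const bool depthIs8U = true;           // stands in for src.depth() == CV_8U

int pixels_per_work_item = 1;
if (cols % 4 == 0 && depthIs8U)
    pixels_per_work_item = 4;          // kernel is built with -D pixels_per_work_item=4
else if (cols % 2 == 0)
    pixels_per_work_item = 2;

// Each work-item handles a short horizontal run of pixels_per_work_item pixels,
// so the global width shrinks from 1920 to 480 while the 16x16 local size is unchanged.
size_t gt[3] = { (size_t)(cols / pixels_per_work_item), (size_t)rows, 1 };
size_t lt[3] = { 16, 16, 1 };
(void)gt; (void)lt;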
openCLExecuteKernel(src.clCxt, &cvt_color, kernelName.c_str(), gt, lt, args, -1, -1, build_options.c_str()); +} + +static void toHSV_caller(const oclMat &src, oclMat &dst, int bidx, const std::string & kernelName, + const std::string & additionalOptions = std::string(), + const oclMat & data1 = oclMat(), const oclMat & data2 = oclMat()) +{ + int src_offset = src.offset / src.elemSize1(), src_step = src.step1(); + int dst_offset = dst.offset / dst.elemSize1(), dst_step = dst.step1(); + + std::string build_options = format("-D DEPTH_%d -D scn=%d -D bidx=%d", src.depth(), src.oclchannels(), bidx); + if (!additionalOptions.empty()) + build_options += additionalOptions; + + std::vector > args; + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.cols)); + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.rows)); + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_step)); + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst_step)); + args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data)); + args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst.data)); + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_offset )); + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst_offset )); + + if (!data1.empty()) + args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&data1.data )); + if (!data2.empty()) + args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&data2.data )); + + size_t gt[3] = { dst.cols, dst.rows, 1 }; +#ifdef ANDROID + size_t lt[3] = { 16, 10, 1 }; +#else + size_t lt[3] = { 16, 16, 1 }; +#endif + openCLExecuteKernel(src.clCxt, &cvt_color, kernelName.c_str(), gt, lt, args, -1, -1, build_options.c_str()); +} + +static void fromGray_caller(const oclMat &src, oclMat &dst, int bidx, const std::string & kernelName, + const std::string & additionalOptions = std::string(), const oclMat & data = oclMat()) +{ + std::string build_options = format("-D DEPTH_%d -D dcn=%d -D bidx=%d", src.depth(), dst.channels(), bidx); + if (!additionalOptions.empty()) + build_options += additionalOptions; + + int src_offset = src.offset / src.elemSize1(), src_step = src.step1(); + int dst_offset = dst.offset / dst.elemSize1(), dst_step = dst.step1(); + + std::vector > args; + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.cols)); + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.rows)); + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_step)); + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst_step)); + args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data)); + args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst.data)); + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_offset )); + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst_offset )); + + if (!data.empty()) + args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&data.data )); + size_t gt[3] = { dst.cols, dst.rows, 1 }; #ifdef ANDROID size_t lt[3] = { 16, 10, 1 }; @@ -89,7 +166,50 @@ static void fromRGB_caller(const oclMat &src, oclMat &dst, int bidx, const std:: static void toRGB_caller(const oclMat &src, oclMat &dst, int bidx, const std::string & kernelName, const std::string & additionalOptions = std::string(), const oclMat & data = oclMat()) { - String build_options = format("-D DEPTH_%d -D dcn=%d", src.depth(), dst.channels()); + int src_offset = src.offset / src.elemSize1(), src_step = src.step1(); + int dst_offset = dst.offset / dst.elemSize1(), 
dst_step = dst.step1(); + int pixels_per_work_item = 1; + + if (Context::getContext()->supportsFeature(FEATURE_CL_INTEL_DEVICE)) + { + if ((src.cols % 4 == 0) && (src.depth() == CV_8U)) + pixels_per_work_item = 4; + else if (src.cols % 2 == 0) + pixels_per_work_item = 2; + else + pixels_per_work_item = 1; + } + + std::string build_options = format("-D DEPTH_%d -D dcn=%d -D bidx=%d -D pixels_per_work_item=%d", src.depth(), dst.channels(), bidx, pixels_per_work_item); + if (!additionalOptions.empty()) + build_options += additionalOptions; + + std::vector > args; + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.cols)); + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.rows)); + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_step)); + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst_step)); + args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data)); + args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst.data)); + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_offset )); + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst_offset )); + + if (!data.empty()) + args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&data.data )); + + size_t gt[3] = { dst.cols/pixels_per_work_item, dst.rows, 1 }; +#ifdef ANDROID + size_t lt[3] = { 16, 10, 1 }; +#else + size_t lt[3] = { 16, 16, 1 }; +#endif + openCLExecuteKernel(src.clCxt, &cvt_color, kernelName.c_str(), gt, lt, args, -1, -1, build_options.c_str()); +} + +static void toRGB_NV12_caller(const oclMat &src, oclMat &dst, int bidx, const std::string & kernelName, + const std::string & additionalOptions = std::string(), const oclMat & data = oclMat()) +{ + String build_options = format("-D DEPTH_%d -D dcn=%d -D bidx=%d", src.depth(), dst.channels(), bidx); if (!additionalOptions.empty()) build_options = build_options + additionalOptions; @@ -101,7 +221,6 @@ static void toRGB_caller(const oclMat &src, oclMat &dst, int bidx, const std::st args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.rows)); args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_step)); args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst_step)); - args.push_back( std::make_pair( sizeof(cl_int) , (void *)&bidx)); args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data)); args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst.data)); args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_offset )); @@ -119,10 +238,13 @@ static void toRGB_caller(const oclMat &src, oclMat &dst, int bidx, const std::st openCLExecuteKernel(src.clCxt, &cvt_color, kernelName.c_str(), gt, lt, args, -1, -1, build_options.c_str()); } -static void RGB_caller(const oclMat &src, oclMat &dst, bool reverse) +static void fromHSV_caller(const oclMat &src, oclMat &dst, int bidx, const std::string & kernelName, + const std::string & additionalOptions = std::string(), const oclMat & data = oclMat()) { - String build_options = format("-D DEPTH_%d -D dcn=%d -D scn=%d -D %s", src.depth(), - dst.channels(), src.channels(), reverse ? 
"REVERSE" : "ORDER"); + std::string build_options = format("-D DEPTH_%d -D dcn=%d -D bidx=%d", src.depth(), dst.channels(), bidx); + if (!additionalOptions.empty()) + build_options += additionalOptions; + int src_offset = src.offset / src.elemSize1(), src_step = src.step1(); int dst_offset = dst.offset / dst.elemSize1(), dst_step = dst.step1(); @@ -136,6 +258,36 @@ static void RGB_caller(const oclMat &src, oclMat &dst, bool reverse) args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_offset )); args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst_offset )); + if (!data.empty()) + args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&data.data )); + + size_t gt[3] = { dst.cols, dst.rows, 1 }; +#ifdef ANDROID + size_t lt[3] = { 16, 10, 1 }; +#else + size_t lt[3] = { 16, 16, 1 }; +#endif + openCLExecuteKernel(src.clCxt, &cvt_color, kernelName.c_str(), gt, lt, args, -1, -1, build_options.c_str()); +} + +static void RGB_caller(const oclMat &src, oclMat &dst, bool reverse) +{ + int src_offset = src.offset / src.elemSize1(), src_step = src.step1(); + int dst_offset = dst.offset / dst.elemSize1(), dst_step = dst.step1(); + + String build_options = format("-D DEPTH_%d -D dcn=%d -D scn=%d -D %s", + src.depth(), dst.channels(), src.channels(), reverse ? "REVERSE" : "ORDER"); + + std::vector > args; + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.cols)); + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.rows)); + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_step)); + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst_step)); + args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data)); + args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst.data)); + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_offset )); + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst_offset )); + size_t gt[3] = { dst.cols, dst.rows, 1 }; #ifdef ANDROID size_t lt[3] = { 16, 10, 1 }; @@ -147,8 +299,8 @@ static void RGB_caller(const oclMat &src, oclMat &dst, bool reverse) static void fromRGB5x5_caller(const oclMat &src, oclMat &dst, int bidx, int greenbits, const std::string & kernelName) { - String build_options = format("-D DEPTH_%d -D greenbits=%d -D dcn=%d", - src.depth(), greenbits, dst.channels()); + String build_options = format("-D DEPTH_%d -D greenbits=%d -D dcn=%d -D bidx=%d", + src.depth(), greenbits, dst.channels(), bidx); int src_offset = src.offset >> 1, src_step = src.step >> 1; int dst_offset = dst.offset / dst.elemSize1(), dst_step = dst.step / dst.elemSize1(); @@ -157,7 +309,6 @@ static void fromRGB5x5_caller(const oclMat &src, oclMat &dst, int bidx, int gree args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.rows)); args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_step)); args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst_step)); - args.push_back( std::make_pair( sizeof(cl_int) , (void *)&bidx)); args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data)); args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst.data)); args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_offset )); @@ -174,8 +325,8 @@ static void fromRGB5x5_caller(const oclMat &src, oclMat &dst, int bidx, int gree static void toRGB5x5_caller(const oclMat &src, oclMat &dst, int bidx, int greenbits, const std::string & kernelName) { - String build_options = format("-D DEPTH_%d -D greenbits=%d -D scn=%d", - src.depth(), greenbits, src.channels()); + String 
build_options = format("-D DEPTH_%d -D greenbits=%d -D scn=%d -D bidx=%d", + src.depth(), greenbits, src.channels(), bidx); int src_offset = (int)src.offset, src_step = (int)src.step; int dst_offset = dst.offset >> 1, dst_step = dst.step >> 1; @@ -184,7 +335,6 @@ static void toRGB5x5_caller(const oclMat &src, oclMat &dst, int bidx, int greenb args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.rows)); args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_step)); args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst_step)); - args.push_back( std::make_pair( sizeof(cl_int) , (void *)&bidx)); args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data)); args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst.data)); args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_offset )); @@ -272,7 +422,7 @@ static void cvtColor_caller(const oclMat &src, oclMat &dst, int code, int dcn) CV_Assert(scn == 1); dcn = code == COLOR_GRAY2BGRA ? 4 : 3; dst.create(sz, CV_MAKETYPE(depth, dcn)); - toRGB_caller(src, dst, 0, "Gray2RGB"); + fromGray_caller(src, dst, 0, "Gray2RGB"); break; } case COLOR_BGR2YUV: case COLOR_RGB2YUV: @@ -303,7 +453,7 @@ static void cvtColor_caller(const oclMat &src, oclMat &dst, int code, int dcn) Size dstSz(sz.width, sz.height * 2 / 3); dst.create(dstSz, CV_MAKETYPE(depth, dcn)); - toRGB_caller(src, dst, bidx, "YUV2RGBA_NV12"); + toRGB_NV12_caller(src, dst, bidx, "YUV2RGBA_NV12"); break; } case COLOR_BGR2YCrCb: case COLOR_RGB2YCrCb: @@ -460,11 +610,11 @@ static void cvtColor_caller(const oclMat &src, oclMat &dst, int code, int dcn) initialized = true; } - fromRGB_caller(src, dst, bidx, kernelName, format(" -D hrange=%d", hrange), sdiv_data, hrange == 256 ? hdiv_data256 : hdiv_data180); + toHSV_caller(src, dst, bidx, kernelName, format(" -D hrange=%d", hrange), sdiv_data, hrange == 256 ? hdiv_data256 : hdiv_data180); return; } - fromRGB_caller(src, dst, bidx, kernelName, format(" -D hscale=%f", hrange*(1.f/360.f))); + toHSV_caller(src, dst, bidx, kernelName, format(" -D hscale=%f", hrange*(1.f/360.f))); break; } case COLOR_HSV2BGR: case COLOR_HSV2RGB: case COLOR_HSV2BGR_FULL: case COLOR_HSV2RGB_FULL: @@ -483,7 +633,7 @@ static void cvtColor_caller(const oclMat &src, oclMat &dst, int code, int dcn) dst.create(sz, CV_MAKETYPE(depth, dcn)); std::string kernelName = std::string(is_hsv ? 
"HSV" : "HLS") + "2RGB"; - toRGB_caller(src, dst, bidx, kernelName, format(" -D hrange=%d -D hscale=%f", hrange, 6.f/hrange)); + fromHSV_caller(src, dst, bidx, kernelName, format(" -D hrange=%d -D hscale=%f", hrange, 6.f/hrange)); break; } case COLOR_RGBA2mRGBA: case COLOR_mRGBA2RGBA: diff --git a/modules/ocl/src/filtering.cpp b/modules/ocl/src/filtering.cpp index 8832b305d..b6e1fff4e 100644 --- a/modules/ocl/src/filtering.cpp +++ b/modules/ocl/src/filtering.cpp @@ -741,6 +741,135 @@ void cv::ocl::filter2D(const oclMat &src, oclMat &dst, int ddepth, const Mat &ke f->apply(src, dst); } +const int optimizedSepFilterLocalSize = 16; +static void sepFilter2D_SinglePass(const oclMat &src, oclMat &dst, + const Mat &row_kernel, const Mat &col_kernel, int bordertype = BORDER_DEFAULT) +{ + size_t lt2[3] = {optimizedSepFilterLocalSize, optimizedSepFilterLocalSize, 1}; + size_t gt2[3] = {lt2[0]*(1 + (src.cols-1) / lt2[0]), lt2[1]*(1 + (src.rows-1) / lt2[1]), 1}; + + unsigned int src_pitch = src.step; + unsigned int dst_pitch = dst.step; + + int src_offset_x = (src.offset % src.step) / src.elemSize(); + int src_offset_y = src.offset / src.step; + + std::vector > args; + args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data )); + args.push_back( std::make_pair( sizeof(cl_uint) , (void *)&src_pitch )); + + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_offset_x )); + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_offset_y )); + + args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst.data )); + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.offset )); + args.push_back( std::make_pair( sizeof(cl_uint) , (void *)&dst_pitch )); + + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.wholecols )); + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.wholerows )); + + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.cols )); + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.rows )); + + String option = cv::format("-D BLK_X=%d -D BLK_Y=%d -D RADIUSX=%d -D RADIUSY=%d",(int)lt2[0], (int)lt2[1], + row_kernel.rows / 2, col_kernel.rows / 2 ); + + option += " -D KERNEL_MATRIX_X="; + for(int i=0; i( &row_kernel.at(i) ) ); + option += "0x0"; + + option += " -D KERNEL_MATRIX_Y="; + for(int i=0; i( &col_kernel.at(i) ) ); + option += "0x0"; + + switch(src.type()) + { + case CV_8UC1: + option += " -D SRCTYPE=uchar -D CONVERT_SRCTYPE=convert_float -D WORKTYPE=float"; + break; + case CV_32FC1: + option += " -D SRCTYPE=float -D CONVERT_SRCTYPE= -D WORKTYPE=float"; + break; + case CV_8UC2: + option += " -D SRCTYPE=uchar2 -D CONVERT_SRCTYPE=convert_float2 -D WORKTYPE=float2"; + break; + case CV_32FC2: + option += " -D SRCTYPE=float2 -D CONVERT_SRCTYPE= -D WORKTYPE=float2"; + break; + case CV_8UC3: + option += " -D SRCTYPE=uchar3 -D CONVERT_SRCTYPE=convert_float3 -D WORKTYPE=float3"; + break; + case CV_32FC3: + option += " -D SRCTYPE=float3 -D CONVERT_SRCTYPE= -D WORKTYPE=float3"; + break; + case CV_8UC4: + option += " -D SRCTYPE=uchar4 -D CONVERT_SRCTYPE=convert_float4 -D WORKTYPE=float4"; + break; + case CV_32FC4: + option += " -D SRCTYPE=float4 -D CONVERT_SRCTYPE= -D WORKTYPE=float4"; + break; + default: + CV_Error(CV_StsUnsupportedFormat, "Image type is not supported!"); + break; + } + switch(dst.type()) + { + case CV_8UC1: + option += " -D DSTTYPE=uchar -D CONVERT_DSTTYPE=convert_uchar_sat"; + break; + case CV_8UC2: + option += " -D DSTTYPE=uchar2 -D CONVERT_DSTTYPE=convert_uchar2_sat"; + break; + case CV_8UC3: 
+ option += " -D DSTTYPE=uchar3 -D CONVERT_DSTTYPE=convert_uchar3_sat"; + break; + case CV_8UC4: + option += " -D DSTTYPE=uchar4 -D CONVERT_DSTTYPE=convert_uchar4_sat"; + break; + case CV_32FC1: + option += " -D DSTTYPE=float -D CONVERT_DSTTYPE="; + break; + case CV_32FC2: + option += " -D DSTTYPE=float2 -D CONVERT_DSTTYPE="; + break; + case CV_32FC3: + option += " -D DSTTYPE=float3 -D CONVERT_DSTTYPE="; + break; + case CV_32FC4: + option += " -D DSTTYPE=float4 -D CONVERT_DSTTYPE="; + break; + default: + CV_Error(CV_StsUnsupportedFormat, "Image type is not supported!"); + break; + } + switch(bordertype) + { + case cv::BORDER_CONSTANT: + option += " -D BORDER_CONSTANT"; + break; + case cv::BORDER_REPLICATE: + option += " -D BORDER_REPLICATE"; + break; + case cv::BORDER_REFLECT: + option += " -D BORDER_REFLECT"; + break; + case cv::BORDER_REFLECT101: + option += " -D BORDER_REFLECT_101"; + break; + case cv::BORDER_WRAP: + option += " -D BORDER_WRAP"; + break; + default: + CV_Error(CV_StsBadFlag, "BORDER type is not supported!"); + break; + } + + openCLExecuteKernel(src.clCxt, &filtering_sep_filter_singlepass, "sep_filter_singlepass", gt2, lt2, args, + -1, -1, option.c_str() ); +} + //////////////////////////////////////////////////////////////////////////////////////////////////// // SeparableFilter @@ -790,6 +919,35 @@ Ptr cv::ocl::createSeparableFilter_GPU(const Ptr(rowFilter, columnFilter); } +namespace +{ +class SingleStepSeparableFilterEngine_GPU : public FilterEngine_GPU +{ +public: + SingleStepSeparableFilterEngine_GPU( const Mat &rowKernel_, const Mat &columnKernel_, const int btype ) + { + bordertype = btype; + rowKernel = rowKernel_; + columnKernel = columnKernel_; + } + + virtual void apply(const oclMat &src, oclMat &dst, Rect roi = Rect(0, 0, -1, -1)) + { + normalizeROI(roi, Size(rowKernel.rows, columnKernel.rows), Point(-1,-1), src.size()); + + oclMat srcROI = src(roi); + oclMat dstROI = dst(roi); + + sepFilter2D_SinglePass(src, dst, rowKernel, columnKernel, bordertype); + } + + Mat rowKernel; + Mat columnKernel; + int bordertype; +}; +} + + static void GPUFilterBox(const oclMat &src, oclMat &dst, Size &ksize, const Point anchor, const int borderType) { @@ -1243,17 +1401,32 @@ Ptr cv::ocl::getLinearColumnFilter_GPU(int /*bufType*/, in } Ptr cv::ocl::createSeparableLinearFilter_GPU(int srcType, int dstType, - const Mat &rowKernel, const Mat &columnKernel, const Point &anchor, double delta, int bordertype) + const Mat &rowKernel, const Mat &columnKernel, const Point &anchor, double delta, int bordertype, Size imgSize ) { int sdepth = CV_MAT_DEPTH(srcType), ddepth = CV_MAT_DEPTH(dstType); int cn = CV_MAT_CN(srcType); int bdepth = std::max(std::max(sdepth, ddepth), CV_32F); int bufType = CV_MAKETYPE(bdepth, cn); + Context* clCxt = Context::getContext(); - Ptr rowFilter = getLinearRowFilter_GPU(srcType, bufType, rowKernel, anchor.x, bordertype); - Ptr columnFilter = getLinearColumnFilter_GPU(bufType, dstType, columnKernel, anchor.y, bordertype, delta); + //if image size is non-degenerate and large enough + //and if filter support is reasonable to satisfy larger local memory requirements, + //then we can use single pass routine to avoid extra runtime calls overhead + if( clCxt && clCxt->supportsFeature(FEATURE_CL_INTEL_DEVICE) && + rowKernel.rows <= 21 && columnKernel.rows <= 21 && + (rowKernel.rows & 1) == 1 && (columnKernel.rows & 1) == 1 && + imgSize.width > optimizedSepFilterLocalSize + (rowKernel.rows>>1) && + imgSize.height > optimizedSepFilterLocalSize + (columnKernel.rows>>1) 
) + { + return Ptr(new SingleStepSeparableFilterEngine_GPU(rowKernel, columnKernel, bordertype)); + } + else + { + Ptr rowFilter = getLinearRowFilter_GPU(srcType, bufType, rowKernel, anchor.x, bordertype); + Ptr columnFilter = getLinearColumnFilter_GPU(bufType, dstType, columnKernel, anchor.y, bordertype, delta); - return createSeparableFilter_GPU(rowFilter, columnFilter); + return createSeparableFilter_GPU(rowFilter, columnFilter); + } } void cv::ocl::sepFilter2D(const oclMat &src, oclMat &dst, int ddepth, const Mat &kernelX, const Mat &kernelY, Point anchor, double delta, int bordertype) @@ -1277,16 +1450,16 @@ void cv::ocl::sepFilter2D(const oclMat &src, oclMat &dst, int ddepth, const Mat dst.create(src.size(), CV_MAKETYPE(ddepth, src.channels())); - Ptr f = createSeparableLinearFilter_GPU(src.type(), dst.type(), kernelX, kernelY, anchor, delta, bordertype); + Ptr f = createSeparableLinearFilter_GPU(src.type(), dst.type(), kernelX, kernelY, anchor, delta, bordertype, src.size()); f->apply(src, dst); } -Ptr cv::ocl::createDerivFilter_GPU(int srcType, int dstType, int dx, int dy, int ksize, int borderType) +Ptr cv::ocl::createDerivFilter_GPU(int srcType, int dstType, int dx, int dy, int ksize, int borderType, Size imgSize ) { Mat kx, ky; getDerivKernels(kx, ky, dx, dy, ksize, false, CV_32F); return createSeparableLinearFilter_GPU(srcType, dstType, - kx, ky, Point(-1, -1), 0, borderType); + kx, ky, Point(-1, -1), 0, borderType, imgSize); } //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -1356,7 +1529,7 @@ void cv::ocl::Laplacian(const oclMat &src, oclMat &dst, int ddepth, int ksize, d //////////////////////////////////////////////////////////////////////////////////////////////////// // Gaussian Filter -Ptr cv::ocl::createGaussianFilter_GPU(int type, Size ksize, double sigma1, double sigma2, int bordertype) +Ptr cv::ocl::createGaussianFilter_GPU(int type, Size ksize, double sigma1, double sigma2, int bordertype, Size imgSize) { int depth = CV_MAT_DEPTH(type); @@ -1383,7 +1556,7 @@ Ptr cv::ocl::createGaussianFilter_GPU(int type, Size ksize, do else ky = getGaussianKernel(ksize.height, sigma2, std::max(depth, CV_32F)); - return createSeparableLinearFilter_GPU(type, type, kx, ky, Point(-1, -1), 0.0, bordertype); + return createSeparableLinearFilter_GPU(type, type, kx, ky, Point(-1, -1), 0.0, bordertype, imgSize); } void cv::ocl::GaussianBlur(const oclMat &src, oclMat &dst, Size ksize, double sigma1, double sigma2, int bordertype) @@ -1419,7 +1592,7 @@ void cv::ocl::GaussianBlur(const oclMat &src, oclMat &dst, Size ksize, double si dst.create(src.size(), src.type()); - Ptr f = createGaussianFilter_GPU(src.type(), ksize, sigma1, sigma2, bordertype); + Ptr f = createGaussianFilter_GPU(src.type(), ksize, sigma1, sigma2, bordertype, src.size()); f->apply(src, dst); } diff --git a/modules/ocl/src/gftt.cpp b/modules/ocl/src/gftt.cpp index b07286553..09cd2a192 100644 --- a/modules/ocl/src/gftt.cpp +++ b/modules/ocl/src/gftt.cpp @@ -48,154 +48,142 @@ using namespace cv; using namespace cv::ocl; +// currently sort procedure on the host is more efficient static bool use_cpu_sorter = true; -namespace +// compact structure for corners +struct DefCorner { -enum SortMethod + float eig; //eigenvalue of corner + short x; //x coordinate of corner point + short y; //y coordinate of corner point +} ; + +// compare procedure for corner +//it is used for sort on the host side +struct DefCornerCompare { - CPU_STL, - BITONIC, - SELECTION -}; - -const int 
GROUP_SIZE = 256; - -template -struct Sorter -{ - //typedef EigType; -}; - -//TODO(pengx): optimize GPU sorter's performance thus CPU sorter is removed. -template<> -struct Sorter -{ - typedef oclMat EigType; - static cv::Mutex cs; - static Mat mat_eig; - - //prototype - static int clfloat2Gt(cl_float2 pt1, cl_float2 pt2) + bool operator()(const DefCorner a, const DefCorner b) const { - float v1 = mat_eig.at(cvRound(pt1.s[1]), cvRound(pt1.s[0])); - float v2 = mat_eig.at(cvRound(pt2.s[1]), cvRound(pt2.s[0])); - return v1 > v2; - } - static void sortCorners_caller(const EigType& eig_tex, oclMat& corners, const int count) - { - cv::AutoLock lock(cs); - //temporarily use STL's sort function - Mat mat_corners = corners; - mat_eig = eig_tex; - std::sort(mat_corners.begin(), mat_corners.begin() + count, clfloat2Gt); - corners = mat_corners; + return a.eig > b.eig; } }; -cv::Mutex Sorter::cs; -cv::Mat Sorter::mat_eig; -template<> -struct Sorter +// sort corner point using opencl bitonicosrt implementation +static void sortCorners_caller(oclMat& corners, const int count) { - typedef TextureCL EigType; + Context * cxt = Context::getContext(); + int GS = count/2; + int LS = min(255,GS); + size_t globalThreads[3] = {GS, 1, 1}; + size_t localThreads[3] = {LS, 1, 1}; - static void sortCorners_caller(const EigType& eig_tex, oclMat& corners, const int count) + // 2^numStages should be equal to count or the output is invalid + int numStages = 0; + for(int i = count; i > 1; i >>= 1) { - Context * cxt = Context::getContext(); - size_t globalThreads[3] = {count / 2, 1, 1}; - size_t localThreads[3] = {GROUP_SIZE, 1, 1}; - - // 2^numStages should be equal to count or the output is invalid - int numStages = 0; - for(int i = count; i > 1; i >>= 1) + ++numStages; + } + const int argc = 4; + std::vector< std::pair > args(argc); + std::string kernelname = "sortCorners_bitonicSort"; + args[0] = std::make_pair(sizeof(cl_mem), (void *)&corners.data); + args[1] = std::make_pair(sizeof(cl_int), (void *)&count); + for(int stage = 0; stage < numStages; ++stage) + { + args[2] = std::make_pair(sizeof(cl_int), (void *)&stage); + for(int passOfStage = 0; passOfStage < stage + 1; ++passOfStage) { - ++numStages; - } - const int argc = 5; - std::vector< std::pair > args(argc); - String kernelname = "sortCorners_bitonicSort"; - args[0] = std::make_pair(sizeof(cl_mem), (void *)&eig_tex); - args[1] = std::make_pair(sizeof(cl_mem), (void *)&corners.data); - args[2] = std::make_pair(sizeof(cl_int), (void *)&count); - for(int stage = 0; stage < numStages; ++stage) - { - args[3] = std::make_pair(sizeof(cl_int), (void *)&stage); - for(int passOfStage = 0; passOfStage < stage + 1; ++passOfStage) - { - args[4] = std::make_pair(sizeof(cl_int), (void *)&passOfStage); - openCLExecuteKernel(cxt, &imgproc_gftt, kernelname, globalThreads, localThreads, args, -1, -1); - } + args[3] = std::make_pair(sizeof(cl_int), (void *)&passOfStage); + openCLExecuteKernel(cxt, &imgproc_gftt, kernelname, globalThreads, localThreads, args, -1, -1); } } -}; +} -template<> -struct Sorter -{ - typedef TextureCL EigType; - - static void sortCorners_caller(const EigType& eig_tex, oclMat& corners, const int count) - { - Context * cxt = Context::getContext(); - - size_t globalThreads[3] = {count, 1, 1}; - size_t localThreads[3] = {GROUP_SIZE, 1, 1}; - - std::vector< std::pair > args; - //local - String kernelname = "sortCorners_selectionSortLocal"; - int lds_size = GROUP_SIZE * sizeof(cl_float2); - args.push_back( std::make_pair( sizeof(cl_mem), (void*)&eig_tex) ); - 
args.push_back( std::make_pair( sizeof(cl_mem), (void*)&corners.data) ); - args.push_back( std::make_pair( sizeof(cl_int), (void*)&count) ); - args.push_back( std::make_pair( lds_size, (void*)NULL) ); - - openCLExecuteKernel(cxt, &imgproc_gftt, kernelname, globalThreads, localThreads, args, -1, -1); - - //final - kernelname = "sortCorners_selectionSortFinal"; - args.pop_back(); - openCLExecuteKernel(cxt, &imgproc_gftt, kernelname, globalThreads, localThreads, args, -1, -1); - } -}; - -int findCorners_caller( - const TextureCL& eig, - const float threshold, - const oclMat& mask, - oclMat& corners, - const int max_count) +// find corners on matrix and put it into array +static void findCorners_caller( + const oclMat& eig_mat, //input matrix worth eigenvalues + oclMat& eigMinMax, //input with min and max values of eigenvalues + const float qualityLevel, + const oclMat& mask, + oclMat& corners, //output array with detected corners + oclMat& counter) //output value with number of detected corners, have to be 0 before call { + String opt; std::vector k; Context * cxt = Context::getContext(); std::vector< std::pair > args; - String kernelname = "findCorners"; const int mask_strip = mask.step / mask.elemSize1(); - oclMat g_counter(1, 1, CV_32SC1); - g_counter.setTo(0); + args.push_back(std::make_pair( sizeof(cl_mem), (void*)&(eig_mat.data))); - args.push_back(std::make_pair( sizeof(cl_mem), (void*)&eig )); + int src_pitch = (int)eig_mat.step; + args.push_back(std::make_pair( sizeof(cl_int), (void*)&src_pitch )); args.push_back(std::make_pair( sizeof(cl_mem), (void*)&mask.data )); args.push_back(std::make_pair( sizeof(cl_mem), (void*)&corners.data )); args.push_back(std::make_pair( sizeof(cl_int), (void*)&mask_strip)); - args.push_back(std::make_pair( sizeof(cl_float), (void*)&threshold )); - args.push_back(std::make_pair( sizeof(cl_int), (void*)&eig.rows )); - args.push_back(std::make_pair( sizeof(cl_int), (void*)&eig.cols )); - args.push_back(std::make_pair( sizeof(cl_int), (void*)&max_count )); - args.push_back(std::make_pair( sizeof(cl_mem), (void*)&g_counter.data )); + args.push_back(std::make_pair( sizeof(cl_mem), (void*)&eigMinMax.data )); + args.push_back(std::make_pair( sizeof(cl_float), (void*)&qualityLevel )); + args.push_back(std::make_pair( sizeof(cl_int), (void*)&eig_mat.rows )); + args.push_back(std::make_pair( sizeof(cl_int), (void*)&eig_mat.cols )); + args.push_back(std::make_pair( sizeof(cl_int), (void*)&corners.cols )); + args.push_back(std::make_pair( sizeof(cl_mem), (void*)&counter.data )); - size_t globalThreads[3] = {eig.cols, eig.rows, 1}; + size_t globalThreads[3] = {eig_mat.cols, eig_mat.rows, 1}; size_t localThreads[3] = {16, 16, 1}; + if(!mask.empty()) + opt += " -D WITH_MASK=1"; - const char * opt = mask.empty() ? 
"" : "-D WITH_MASK"; - openCLExecuteKernel(cxt, &imgproc_gftt, kernelname, globalThreads, localThreads, args, -1, -1, opt); - return std::min(Mat(g_counter).at(0), max_count); + openCLExecuteKernel(cxt, &imgproc_gftt, "findCorners", globalThreads, localThreads, args, -1, -1, opt.c_str()); +} + + +static void minMaxEig_caller(const oclMat &src, oclMat &dst, oclMat & tozero) +{ + size_t groupnum = src.clCxt->getDeviceInfo().maxComputeUnits; + CV_Assert(groupnum != 0); + + int dbsize = groupnum * 2 * src.elemSize(); + + ensureSizeIsEnough(1, dbsize, CV_8UC1, dst); + + cl_mem dst_data = reinterpret_cast(dst.data); + + int all_cols = src.step / src.elemSize(); + int pre_cols = (src.offset % src.step) / src.elemSize(); + int sec_cols = all_cols - (src.offset % src.step + src.cols * src.elemSize() - 1) / src.elemSize() - 1; + int invalid_cols = pre_cols + sec_cols; + int cols = all_cols - invalid_cols , elemnum = cols * src.rows; + int offset = src.offset / src.elemSize(); + + {// first parallel pass + std::vector > args; + args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data)); + args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst_data )); + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&cols )); + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&invalid_cols )); + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&offset)); + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&elemnum)); + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&groupnum)); + size_t globalThreads[3] = {groupnum * 256, 1, 1}; + size_t localThreads[3] = {256, 1, 1}; + openCLExecuteKernel(src.clCxt, &arithm_minMax, "arithm_op_minMax", globalThreads, localThreads, + args, -1, -1, "-D T=float -D DEPTH_5"); + } + + {// run final "serial" kernel to find accumulate results from threads and reset corner counter + std::vector > args; + args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst_data )); + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&groupnum )); + args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&tozero.data )); + size_t globalThreads[3] = {1, 1, 1}; + size_t localThreads[3] = {1, 1, 1}; + openCLExecuteKernel(src.clCxt, &imgproc_gftt, "arithm_op_minMax_final", globalThreads, localThreads, + args, -1, -1); + } } -}//unnamed namespace void cv::ocl::GoodFeaturesToTrackDetector_OCL::operator ()(const oclMat& image, oclMat& corners, const oclMat& mask) { @@ -205,67 +193,99 @@ void cv::ocl::GoodFeaturesToTrackDetector_OCL::operator ()(const oclMat& image, ensureSizeIsEnough(image.size(), CV_32F, eig_); if (useHarrisDetector) - cornerMinEigenVal_dxdy(image, eig_, Dx_, Dy_, blockSize, 3, harrisK); + cornerHarris_dxdy(image, eig_, Dx_, Dy_, blockSize, 3, harrisK); else cornerMinEigenVal_dxdy(image, eig_, Dx_, Dy_, blockSize, 3); - double maxVal = 0; - minMax(eig_, NULL, &maxVal); + ensureSizeIsEnough(1,1, CV_32SC1, counter_); - ensureSizeIsEnough(1, std::max(1000, static_cast(image.size().area() * 0.05)), CV_32FC2, tmpCorners_); + // find max eigenvalue and reset detected counters + minMaxEig_caller(eig_,eig_minmax_,counter_); - Ptr eig_tex = bindTexturePtr(eig_); - int total = findCorners_caller( - *eig_tex, - static_cast(maxVal * qualityLevel), + // allocate buffer for kernels + int corner_array_size = std::max(1024, static_cast(image.size().area() * 0.05)); + + if(!use_cpu_sorter) + { // round to 2^n + unsigned int n=1; + for(n=1;n<(unsigned int)corner_array_size;n<<=1); + corner_array_size = (int)n; + + ensureSizeIsEnough(1, 
corner_array_size , CV_32FC2, tmpCorners_); + + // set to 0 to be able use bitonic sort on whole 2^n array + tmpCorners_.setTo(0); + } + else + { + ensureSizeIsEnough(1, corner_array_size , CV_32FC2, tmpCorners_); + } + + int total = tmpCorners_.cols; // by default the number of corner is full array + std::vector tmp(tmpCorners_.cols); // input buffer with corner for HOST part of algorithm + + //find points with high eigenvalue and put it into the output array + findCorners_caller( + eig_, + eig_minmax_, + static_cast(qualityLevel), mask, tmpCorners_, - tmpCorners_.cols); + counter_); + + if(!use_cpu_sorter) + {// sort detected corners on deivce side + sortCorners_caller(tmpCorners_, corner_array_size); + } + else + {// send non-blocking request to read real non-zero number of corners to sort it on the HOST side + openCLVerifyCall(clEnqueueReadBuffer(getClCommandQueue(counter_.clCxt), (cl_mem)counter_.data, CL_FALSE, 0,sizeof(int), &total, 0, NULL, NULL)); + } + + //blocking read whole corners array (sorted or not sorted) + openCLReadBuffer(tmpCorners_.clCxt,(cl_mem)tmpCorners_.data,&tmp[0],tmpCorners_.cols*sizeof(DefCorner)); if (total == 0) - { + {// check for trivial case corners.release(); return; } + if(use_cpu_sorter) - { - Sorter::sortCorners_caller(eig_, tmpCorners_, total); - } - else - { - //if total is power of 2 - if(((total - 1) & (total)) == 0) - { - Sorter::sortCorners_caller(*eig_tex, tmpCorners_, total); - } - else - { - Sorter::sortCorners_caller(*eig_tex, tmpCorners_, total); - } + {// sort detected corners on cpu side. + tmp.resize(total); + std::sort(tmp.begin(), tmp.end(), DefCornerCompare()); } + //estimate maximal size of final output array + int total_max = maxCorners > 0 ? std::min(maxCorners, total) : total; + int D2 = (int)ceil(minDistance * minDistance); + // allocate output buffer + std::vector tmp2; + tmp2.reserve(total_max); + + if (minDistance < 1) - { - Rect roi_range(0, 0, maxCorners > 0 ? std::min(maxCorners, total) : total, 1); - tmpCorners_(roi_range).copyTo(corners); + {// we have not distance restriction. then just copy with conversion maximal allowed points into output array + for(int i=0;i0.0f;++i) + { + tmp2.push_back(Point2f(tmp[i].x,tmp[i].y)); + } } else - { - std::vector tmp(total); - downloadPoints(tmpCorners_, tmp); - - std::vector tmp2; - tmp2.reserve(total); - + {// we have distance restriction. then start coping to output array from the first element and check distance for each next one const int cell_size = cvRound(minDistance); const int grid_width = (image.cols + cell_size - 1) / cell_size; const int grid_height = (image.rows + cell_size - 1) / cell_size; - std::vector< std::vector > grid(grid_width * grid_height); + std::vector< std::vector > grid(grid_width * grid_height); - for (int i = 0; i < total; ++i) + for (int i = 0; i < total ; ++i) { - Point2f p = tmp[i]; + DefCorner p = tmp[i]; + + if(p.eig<=0.0f) + break; // condition to stop that is needed for GPU bitonic sort usage. 
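+            // Scan the 3x3 block of grid cells around the candidate; cells are
+            // cvRound(minDistance) wide, and the squared pixel distance to every
+            // corner already stored there is compared against
+            // D2 = (int)ceil(minDistance * minDistance) computed above.  The goto
+            // below exits both cell loops as soon as a close neighbour is found.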
bool good = true; @@ -287,40 +307,42 @@ void cv::ocl::GoodFeaturesToTrackDetector_OCL::operator ()(const oclMat& image, { for (int xx = x1; xx <= x2; xx++) { - std::vector& m = grid[yy * grid_width + xx]; - - if (!m.empty()) + std::vector& m = grid[yy * grid_width + xx]; + if (m.empty()) + continue; + for(size_t j = 0; j < m.size(); j++) { - for(size_t j = 0; j < m.size(); j++) - { - float dx = p.x - m[j].x; - float dy = p.y - m[j].y; + int dx = p.x - m[j].x; + int dy = p.y - m[j].y; - if (dx * dx + dy * dy < minDistance * minDistance) - { - good = false; - goto break_out; - } + if (dx * dx + dy * dy < D2) + { + good = false; + goto break_out_; } } } } - break_out: + break_out_: if(good) { - grid[y_cell * grid_width + x_cell].push_back(p); + grid[y_cell * grid_width + x_cell].push_back(Point2i(p.x,p.y)); - tmp2.push_back(p); + tmp2.push_back(Point2f(p.x,p.y)); if (maxCorners > 0 && tmp2.size() == static_cast(maxCorners)) break; } } - corners.upload(Mat(1, static_cast(tmp2.size()), CV_32FC2, &tmp2[0])); } + int final_size = static_cast(tmp2.size()); + if(final_size>0) + corners.upload(Mat(1, final_size, CV_32FC2, &tmp2[0])); + else + corners.release(); } void cv::ocl::GoodFeaturesToTrackDetector_OCL::downloadPoints(const oclMat &points, std::vector &points_v) { diff --git a/modules/ocl/src/haar.cpp b/modules/ocl/src/haar.cpp index a023f8a04..d38b3bad9 100644 --- a/modules/ocl/src/haar.cpp +++ b/modules/ocl/src/haar.cpp @@ -866,16 +866,17 @@ void OclCascadeClassifier::detectMultiScale(oclMat &gimg, CV_OUT std::vectoris_stump_based && gsum.clCxt->supportsFeature(FEATURE_CL_INTEL_DEVICE)) { - //setup local group size - localThreads[0] = 8; - localThreads[1] = 16; + //setup local group size for "pixel step" = 1 + localThreads[0] = 16; + localThreads[1] = 32; localThreads[2] = 1; - //init maximal number of workgroups + //calc maximal number of workgroups int WGNumX = 1+(sizev[0].width /(localThreads[0])); int WGNumY = 1+(sizev[0].height/(localThreads[1])); int WGNumZ = loopcount; - int WGNum = 0; //accurate number of non -empty workgroups + int WGNumTotal = 0; //accurate number of non-empty workgroups + int WGNumSampled = 0; //accurate number of workgroups processed only 1/4 part of all pixels. 
it is made for large images with scale <= 2 oclMat oclWGInfo(1,sizeof(cl_int4) * WGNumX*WGNumY*WGNumZ,CV_8U); { cl_int4* pWGInfo = (cl_int4*)clEnqueueMapBuffer(getClCommandQueue(oclWGInfo.clCxt),(cl_mem)oclWGInfo.datastart,true,CL_MAP_WRITE, 0, oclWGInfo.step, 0,0,0,&status); @@ -895,12 +896,16 @@ void OclCascadeClassifier::detectMultiScale(oclMat &gimg, CV_OUT std::vector=(Width-cascade->orig_window_size.width)) continue; // no data to process + if(scaleinfo[z].factor<=2) + { + WGNumSampled++; + } // save no-empty workgroup info into array - pWGInfo[WGNum].s[0] = scaleinfo[z].width_height; - pWGInfo[WGNum].s[1] = (gx << 16) | gy; - pWGInfo[WGNum].s[2] = scaleinfo[z].imgoff; - memcpy(&(pWGInfo[WGNum].s[3]),&(scaleinfo[z].factor),sizeof(float)); - WGNum++; + pWGInfo[WGNumTotal].s[0] = scaleinfo[z].width_height; + pWGInfo[WGNumTotal].s[1] = (gx << 16) | gy; + pWGInfo[WGNumTotal].s[2] = scaleinfo[z].imgoff; + memcpy(&(pWGInfo[WGNumTotal].s[3]),&(scaleinfo[z].factor),sizeof(float)); + WGNumTotal++; } } } @@ -908,13 +913,8 @@ void OclCascadeClassifier::detectMultiScale(oclMat &gimg, CV_OUT std::vectororig_window_size.width); options += format(" -D WND_SIZE_Y=%d",cascade->orig_window_size.height); options += format(" -D STUMP_BASED=%d",gcascade->is_stump_based); - options += format(" -D LSx=%d",localThreads[0]); - options += format(" -D LSy=%d",localThreads[1]); options += format(" -D SPLITNODE=%d",splitnode); options += format(" -D SPLITSTAGE=%d",splitstage); options += format(" -D OUTPUTSZ=%d",outputsz); @@ -972,8 +970,39 @@ void OclCascadeClassifier::detectMultiScale(oclMat &gimg, CV_OUT std::vectorWGNumSampled) + {// small images and each pixel is processed + // setup global sizes to have linear array of workgroups with WGNum size + int pixelstep = 1; + size_t LS[3]={localThreads[0]/pixelstep,localThreads[1]/pixelstep,1}; + globalThreads[0] = LS[0]*(WGNumTotal-WGNumSampled); + globalThreads[1] = LS[1]; + globalThreads[2] = 1; + String options1 = options; + options1 += format(" -D PIXEL_STEP=%d",pixelstep); + options1 += format(" -D WGSTART=%d",WGNumSampled); + options1 += format(" -D LSx=%d",LS[0]); + options1 += format(" -D LSy=%d",LS[1]); + // execute face detector + openCLExecuteKernel(gsum.clCxt, &haarobjectdetect, "gpuRunHaarClassifierCascadePacked", globalThreads, LS, args, -1, -1, options1.c_str()); + } + if(WGNumSampled>0) + {// large images each 4th pixel is processed + // setup global sizes to have linear array of workgroups with WGNum size + int pixelstep = 2; + size_t LS[3]={localThreads[0]/pixelstep,localThreads[1]/pixelstep,1}; + globalThreads[0] = LS[0]*WGNumSampled; + globalThreads[1] = LS[1]; + globalThreads[2] = 1; + String options2 = options; + options2 += format(" -D PIXEL_STEP=%d",pixelstep); + options2 += format(" -D WGSTART=%d",0); + options2 += format(" -D LSx=%d",LS[0]); + options2 += format(" -D LSy=%d",LS[1]); + // execute face detector + openCLExecuteKernel(gsum.clCxt, &haarobjectdetect, "gpuRunHaarClassifierCascadePacked", globalThreads, LS, args, -1, -1, options2.c_str()); + } //read candidate buffer back and put it into host list openCLReadBuffer( gsum.clCxt, candidatebuffer, candidate, 4 * sizeof(int)*outputsz ); assert(candidate[0]supportsFeature(FEATURE_CL_INTEL_DEVICE)) + { + qangle_type = CV_32SC2; + qangle_step_shift = 2; + } } void cv::ocl::device::hog::compute_hists(int nbins, @@ -1627,7 +1643,7 @@ void cv::ocl::device::hog::compute_hists(int nbins, int blocks_total = img_block_width * img_block_height; int grad_quadstep = grad.step >> 2; - int 
qangle_step = qangle.step; + int qangle_step = qangle.step >> qangle_step_shift; int blocks_in_group = 4; size_t localThreads[3] = { blocks_in_group * 24, 2, 1 }; @@ -1892,7 +1908,7 @@ void cv::ocl::device::hog::compute_gradients_8UC1(int height, int width, char correctGamma = (correct_gamma) ? 1 : 0; int img_step = img.step; int grad_quadstep = grad.step >> 3; - int qangle_step = qangle.step >> 1; + int qangle_step = qangle.step >> (1 + qangle_step_shift); args.push_back( std::make_pair( sizeof(cl_int), (void *)&height)); args.push_back( std::make_pair( sizeof(cl_int), (void *)&width)); @@ -1927,7 +1943,7 @@ void cv::ocl::device::hog::compute_gradients_8UC4(int height, int width, char correctGamma = (correct_gamma) ? 1 : 0; int img_step = img.step >> 2; int grad_quadstep = grad.step >> 3; - int qangle_step = qangle.step >> 1; + int qangle_step = qangle.step >> (1 + qangle_step_shift); args.push_back( std::make_pair( sizeof(cl_int), (void *)&height)); args.push_back( std::make_pair( sizeof(cl_int), (void *)&width)); diff --git a/modules/ocl/src/imgproc.cpp b/modules/ocl/src/imgproc.cpp index f730df10f..0ac627172 100644 --- a/modules/ocl/src/imgproc.cpp +++ b/modules/ocl/src/imgproc.cpp @@ -1035,67 +1035,117 @@ namespace cv else scale = 1. / scale; - if (ksize > 0) + const int sobel_lsz = 16; + if((src.type() == CV_8UC1 || src.type() == CV_32FC1) && + (ksize==3 || ksize==5 || ksize==7 || ksize==-1) && + src.wholerows > sobel_lsz + (ksize>>1) && + src.wholecols > sobel_lsz + (ksize>>1)) { - Context* clCxt = Context::getContext(); - if(clCxt->supportsFeature(FEATURE_CL_INTEL_DEVICE) && src.type() == CV_8UC1 && - src.cols % 8 == 0 && src.rows % 8 == 0 && - ksize==3 && - (borderType ==cv::BORDER_REFLECT || - borderType == cv::BORDER_REPLICATE || - borderType ==cv::BORDER_REFLECT101 || - borderType ==cv::BORDER_WRAP)) + Dx.create(src.size(), CV_32FC1); + Dy.create(src.size(), CV_32FC1); + + CV_Assert(Dx.rows == Dy.rows && Dx.cols == Dy.cols); + + size_t lt2[3] = {sobel_lsz, sobel_lsz, 1}; + size_t gt2[3] = {lt2[0]*(1 + (src.cols-1) / lt2[0]), lt2[1]*(1 + (src.rows-1) / lt2[1]), 1}; + + unsigned int src_pitch = src.step; + unsigned int Dx_pitch = Dx.step; + unsigned int Dy_pitch = Dy.step; + + int src_offset_x = (src.offset % src.step) / src.elemSize(); + int src_offset_y = src.offset / src.step; + + float _scale = scale; + + std::vector > args; + args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data )); + args.push_back( std::make_pair( sizeof(cl_uint) , (void *)&src_pitch )); + + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_offset_x )); + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_offset_y )); + + args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&Dx.data )); + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&Dx.offset )); + args.push_back( std::make_pair( sizeof(cl_uint) , (void *)&Dx_pitch )); + args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&Dy.data )); + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&Dy.offset )); + args.push_back( std::make_pair( sizeof(cl_uint) , (void *)&Dy_pitch )); + + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.wholecols )); + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.wholerows )); + + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&Dx.cols )); + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&Dx.rows )); + + args.push_back( std::make_pair( sizeof(cl_float), (void *)&_scale )); + + String option = cv::format("-D BLK_X=%d -D 
BLK_Y=%d",(int)lt2[0],(int)lt2[1]); + switch(src.type()) { - Dx.create(src.size(), CV_32FC1); - Dy.create(src.size(), CV_32FC1); - - const unsigned int block_x = 8; - const unsigned int block_y = 8; - - unsigned int src_pitch = src.step; - unsigned int dst_pitch = Dx.cols; - - float _scale = scale; - - std::vector > args; - args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data )); - args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&Dx.data )); - args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&Dy.data )); - args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.cols )); - args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.rows )); - args.push_back( std::make_pair( sizeof(cl_uint) , (void *)&src_pitch )); - args.push_back( std::make_pair( sizeof(cl_uint) , (void *)&dst_pitch )); - args.push_back( std::make_pair( sizeof(cl_float) , (void *)&_scale )); - size_t gt2[3] = {src.cols, src.rows, 1}, lt2[3] = {block_x, block_y, 1}; - - String option = "-D BLK_X=8 -D BLK_Y=8"; - switch(borderType) - { - case cv::BORDER_REPLICATE: - option += " -D BORDER_REPLICATE"; - break; - case cv::BORDER_REFLECT: - option += " -D BORDER_REFLECT"; - break; - case cv::BORDER_REFLECT101: - option += " -D BORDER_REFLECT101"; - break; - case cv::BORDER_WRAP: - option += " -D BORDER_WRAP"; - break; - } - openCLExecuteKernel(src.clCxt, &imgproc_sobel3, "sobel3", gt2, lt2, args, -1, -1, option.c_str() ); + case CV_8UC1: + option += " -D SRCTYPE=uchar"; + break; + case CV_32FC1: + option += " -D SRCTYPE=float"; + break; } - else + switch(borderType) + { + case cv::BORDER_CONSTANT: + option += " -D BORDER_CONSTANT"; + break; + case cv::BORDER_REPLICATE: + option += " -D BORDER_REPLICATE"; + break; + case cv::BORDER_REFLECT: + option += " -D BORDER_REFLECT"; + break; + case cv::BORDER_REFLECT101: + option += " -D BORDER_REFLECT_101"; + break; + case cv::BORDER_WRAP: + option += " -D BORDER_WRAP"; + break; + default: + CV_Error(CV_StsBadFlag, "BORDER type is not supported!"); + break; + } + + String kernel_name; + switch(ksize) + { + case -1: + option += " -D SCHARR"; + kernel_name = "sobel3"; + break; + case 3: + kernel_name = "sobel3"; + break; + case 5: + kernel_name = "sobel5"; + break; + case 7: + kernel_name = "sobel7"; + break; + default: + CV_Error(CV_StsBadFlag, "Kernel size is not supported!"); + break; + } + openCLExecuteKernel(src.clCxt, &imgproc_sobel3, kernel_name, gt2, lt2, args, -1, -1, option.c_str() ); + } + else + { + if (ksize > 0) { Sobel(src, Dx, CV_32F, 1, 0, ksize, scale, 0, borderType); Sobel(src, Dy, CV_32F, 0, 1, ksize, scale, 0, borderType); } - } - else - { - Scharr(src, Dx, CV_32F, 1, 0, scale, 0, borderType); - Scharr(src, Dy, CV_32F, 0, 1, scale, 0, borderType); + else + { + Scharr(src, Dx, CV_32F, 1, 0, scale, 0, borderType); + Scharr(src, Dy, CV_32F, 0, 1, scale, 0, borderType); + } } CV_Assert(Dx.offset == 0 && Dy.offset == 0); } diff --git a/modules/ocl/src/opencl/cvt_color.cl b/modules/ocl/src/opencl/cvt_color.cl index bf3b6cfa7..5c236f0e0 100644 --- a/modules/ocl/src/opencl/cvt_color.cl +++ b/modules/ocl/src/opencl/cvt_color.cl @@ -56,35 +56,59 @@ #ifdef DEPTH_0 #define DATA_TYPE uchar +#define VECTOR2 uchar2 +#define VECTOR4 uchar4 +#define VECTOR8 uchar8 +#define VECTOR16 uchar16 #define COEFF_TYPE int #define MAX_NUM 255 #define HALF_MAX 128 #define SAT_CAST(num) convert_uchar_sat_rte(num) +#define SAT_CAST2(num) convert_uchar2_sat(num) +#define SAT_CAST4(num) convert_uchar4_sat(num) #endif #ifdef DEPTH_2 #define DATA_TYPE ushort 
+#define VECTOR2 ushort2 +#define VECTOR4 ushort4 +#define VECTOR8 ushort8 +#define VECTOR16 ushort16 #define COEFF_TYPE int #define MAX_NUM 65535 #define HALF_MAX 32768 #define SAT_CAST(num) convert_ushort_sat_rte(num) +#define SAT_CAST2(num) convert_ushort2_sat(num) +#define SAT_CAST4(num) convert_ushort4_sat(num) #endif #ifdef DEPTH_5 #define DATA_TYPE float +#define VECTOR2 float2 +#define VECTOR4 float4 +#define VECTOR8 float8 +#define VECTOR16 float16 #define COEFF_TYPE float #define MAX_NUM 1.0f #define HALF_MAX 0.5f #define SAT_CAST(num) (num) #endif +#ifndef bidx + #define bidx 0 +#endif + +#ifndef pixels_per_work_item + #define pixels_per_work_item 1 +#endif + #define CV_DESCALE(x, n) (((x) + (1 << ((n)-1))) >> (n)) enum { yuv_shift = 14, xyz_shift = 12, - hsv_shift = 12, + hsv_shift = 12, R2Y = 4899, G2Y = 9617, B2Y = 1868, @@ -93,26 +117,87 @@ enum ///////////////////////////////////// RGB <-> GRAY ////////////////////////////////////// +__constant float c_RGB2GrayCoeffs_f[3] = { 0.114f, 0.587f, 0.299f }; +__constant int c_RGB2GrayCoeffs_i[3] = { B2Y, G2Y, R2Y }; + __kernel void RGB2Gray(int cols, int rows, int src_step, int dst_step, - int bidx, __global const DATA_TYPE* src, __global DATA_TYPE* dst, + __global const DATA_TYPE* src, __global DATA_TYPE* dst, int src_offset, int dst_offset) { - int x = get_global_id(0); + int x = get_global_id(0) * pixels_per_work_item; int y = get_global_id(1); if (y < rows && x < cols) { int src_idx = mad24(y, src_step, src_offset + (x << 2)); int dst_idx = mad24(y, dst_step, dst_offset + x); + +#ifndef INTEL_DEVICE + #ifdef DEPTH_5 dst[dst_idx] = src[src_idx + bidx] * 0.114f + src[src_idx + 1] * 0.587f + src[src_idx + (bidx^2)] * 0.299f; #else dst[dst_idx] = (DATA_TYPE)CV_DESCALE((src[src_idx + bidx] * B2Y + src[src_idx + 1] * G2Y + src[src_idx + (bidx^2)] * R2Y), yuv_shift); #endif + +#else //INTEL_DEVICE + global DATA_TYPE *src_ptr = (global DATA_TYPE *)(src + src_idx); + global DATA_TYPE *dst_ptr = (global DATA_TYPE *)(dst + dst_idx); + +#ifdef DEPTH_5 + __constant float * coeffs = c_RGB2GrayCoeffs_f; +#else + __constant int * coeffs = c_RGB2GrayCoeffs_i; +#endif + +#if (1 == pixels_per_work_item) + { +#ifdef DEPTH_5 + *dst_ptr = src_ptr[bidx] * coeffs[0] + src_ptr[1] * coeffs[1] + src_ptr[(bidx^2)] *coeffs[2]; +#else + *dst_ptr = (DATA_TYPE)CV_DESCALE((src_ptr[bidx] * coeffs[0] + src_ptr[1] * coeffs[1] + src_ptr[(bidx^2)] * coeffs[2]), yuv_shift); +#endif + } +#elif (2 == pixels_per_work_item) + { + const VECTOR8 r0 = vload8(0, src_ptr); + +#ifdef DEPTH_5 + const float2 c0 = r0.s04; + const float2 c1 = r0.s15; + const float2 c2 = r0.s26; + + const float2 Y = c0 * coeffs[bidx] + c1 * coeffs[1] + c2 * coeffs[bidx^2]; +#else + const int2 c0 = convert_int2(r0.s04); + const int2 c1 = convert_int2(r0.s15); + const int2 c2 = convert_int2(r0.s26); + + const int2 yi = CV_DESCALE(c0 * coeffs[bidx] + c1 * coeffs[1] + c2 * coeffs[bidx^2], yuv_shift); + const VECTOR2 Y = SAT_CAST2(yi); +#endif + + vstore2(Y, 0, dst_ptr); + } +#elif (4 == pixels_per_work_item) + { +#ifndef DEPTH_5 + const VECTOR16 r0 = vload16(0, src_ptr); + + const int4 c0 = convert_int4(r0.s048c); + const int4 c1 = convert_int4(r0.s159d); + const int4 c2 = convert_int4(r0.s26ae); + const int4 Y = CV_DESCALE(c0 * coeffs[bidx] + c1 * coeffs[1] + c2 * coeffs[bidx^2], yuv_shift); + + vstore4(SAT_CAST4(Y), 0, dst_ptr); +#endif + } +#endif //pixels_per_work_item +#endif //INTEL_DEVICE } } -__kernel void Gray2RGB(int cols, int rows, int src_step, int dst_step, int bidx, +__kernel void 
Gray2RGB(int cols, int rows, int src_step, int dst_step, __global const DATA_TYPE* src, __global DATA_TYPE* dst, int src_offset, int dst_offset) { @@ -140,10 +225,10 @@ __constant float c_RGB2YUVCoeffs_f[5] = { 0.114f, 0.587f, 0.299f, 0.492f, 0.877 __constant int c_RGB2YUVCoeffs_i[5] = { B2Y, G2Y, R2Y, 8061, 14369 }; __kernel void RGB2YUV(int cols, int rows, int src_step, int dst_step, - int bidx, __global const DATA_TYPE* src, __global DATA_TYPE* dst, + __global const DATA_TYPE* src, __global DATA_TYPE* dst, int src_offset, int dst_offset) { - int x = get_global_id(0); + int x = get_global_id(0) * pixels_per_work_item; int y = get_global_id(1); if (y < rows && x < cols) @@ -151,24 +236,84 @@ __kernel void RGB2YUV(int cols, int rows, int src_step, int dst_step, x <<= 2; int src_idx = mad24(y, src_step, src_offset + x); int dst_idx = mad24(y, dst_step, dst_offset + x); - DATA_TYPE rgb[] = { src[src_idx], src[src_idx + 1], src[src_idx + 2] }; + + global DATA_TYPE *src_ptr = (global DATA_TYPE *)(src + src_idx); + global DATA_TYPE *dst_ptr = (global DATA_TYPE *)(dst + dst_idx); #ifdef DEPTH_5 __constant float * coeffs = c_RGB2YUVCoeffs_f; - DATA_TYPE Y = rgb[0] * coeffs[bidx^2] + rgb[1] * coeffs[1] + rgb[2] * coeffs[bidx]; - DATA_TYPE Cr = (rgb[bidx^2] - Y) * coeffs[3] + HALF_MAX; - DATA_TYPE Cb = (rgb[bidx] - Y) * coeffs[4] + HALF_MAX; #else __constant int * coeffs = c_RGB2YUVCoeffs_i; - int delta = HALF_MAX * (1 << yuv_shift); - int Y = CV_DESCALE(rgb[0] * coeffs[bidx^2] + rgb[1] * coeffs[1] + rgb[2] * coeffs[bidx], yuv_shift); - int Cr = CV_DESCALE((rgb[bidx^2] - Y) * coeffs[3] + delta, yuv_shift); - int Cb = CV_DESCALE((rgb[bidx] - Y) * coeffs[4] + delta, yuv_shift); + const int delta = HALF_MAX * (1 << yuv_shift); #endif - dst[dst_idx] = SAT_CAST( Y ); - dst[dst_idx + 1] = SAT_CAST( Cr ); - dst[dst_idx + 2] = SAT_CAST( Cb ); +#if (1 == pixels_per_work_item) + { + const DATA_TYPE rgb[] = {src_ptr[0], src_ptr[1], src_ptr[2]}; + +#ifdef DEPTH_5 + float Y = rgb[0] * coeffs[bidx^2] + rgb[1] * coeffs[1] + rgb[2] * coeffs[bidx]; + float U = (rgb[bidx^2] - Y) * coeffs[3] + HALF_MAX; + float V = (rgb[bidx] - Y) * coeffs[4] + HALF_MAX; +#else + int Y = CV_DESCALE(rgb[0] * coeffs[bidx^2] + rgb[1] * coeffs[1] + rgb[2] * coeffs[bidx], yuv_shift); + int U = CV_DESCALE((rgb[bidx^2] - Y) * coeffs[3] + delta, yuv_shift); + int V = CV_DESCALE((rgb[bidx] - Y) * coeffs[4] + delta, yuv_shift); +#endif + + dst_ptr[0] = SAT_CAST( Y ); + dst_ptr[1] = SAT_CAST( U ); + dst_ptr[2] = SAT_CAST( V ); + } +#elif (2 == pixels_per_work_item) + { + const VECTOR8 r0 = vload8(0, src_ptr); + +#ifdef DEPTH_5 + const float2 c0 = r0.s04; + const float2 c1 = r0.s15; + const float2 c2 = r0.s26; + + const float2 Y = (bidx == 0) ? (c0 * coeffs[2] + c1 * coeffs[1] + c2 * coeffs[0]) : (c0 * coeffs[0] + c1 * coeffs[1] + c2 * coeffs[2]); + const float2 U = (bidx == 0) ? ((c2 - Y) * coeffs[3] + HALF_MAX) : ((c0 - Y) * coeffs[3] + HALF_MAX); + const float2 V = (bidx == 0) ? ((c0 - Y) * coeffs[4] + HALF_MAX) : ((c2 - Y) * coeffs[4] + HALF_MAX); +#else + const int2 c0 = convert_int2(r0.s04); + const int2 c1 = convert_int2(r0.s15); + const int2 c2 = convert_int2(r0.s26); + + const int2 yi = (bidx == 0) ? CV_DESCALE(c0 * coeffs[2] + c1 * coeffs[1] + c2 * coeffs[0], yuv_shift) : CV_DESCALE(c0 * coeffs[0] + c1 * coeffs[1] + c2 * coeffs[2], yuv_shift); + const int2 ui = (bidx == 0) ? CV_DESCALE((c2 - yi) * coeffs[3] + delta, yuv_shift) : CV_DESCALE((c0 - yi) * coeffs[3] + delta, yuv_shift); + const int2 vi = (bidx == 0) ? 
CV_DESCALE((c0 - yi) * coeffs[4] + delta, yuv_shift) : CV_DESCALE((c2 - yi) * coeffs[4] + delta, yuv_shift); + + const VECTOR2 Y = SAT_CAST2(yi); + const VECTOR2 U = SAT_CAST2(ui); + const VECTOR2 V = SAT_CAST2(vi); +#endif + + vstore8((VECTOR8)(Y.s0, U.s0, V.s0, 0, Y.s1, U.s1, V.s1, 0), 0, dst_ptr); + } +#elif (4 == pixels_per_work_item) + { +#ifndef DEPTH_5 + const VECTOR16 r0 = vload16(0, src_ptr); + + const int4 c0 = convert_int4(r0.s048c); + const int4 c1 = convert_int4(r0.s159d); + const int4 c2 = convert_int4(r0.s26ae); + + const int4 yi = (bidx == 0) ? CV_DESCALE(c0 * coeffs[2] + c1 * coeffs[1] + c2 * coeffs[0], yuv_shift) : CV_DESCALE(c0 * coeffs[0] + c1 * coeffs[1] + c2 * coeffs[2], yuv_shift); + const int4 ui = (bidx == 0) ? CV_DESCALE((c2 - yi) * coeffs[3] + delta, yuv_shift) : CV_DESCALE((c0 - yi) * coeffs[3] + delta, yuv_shift); + const int4 vi = (bidx == 0) ? CV_DESCALE((c0 - yi) * coeffs[4] + delta, yuv_shift) : CV_DESCALE((c2 - yi) * coeffs[4] + delta, yuv_shift); + + const VECTOR4 Y = SAT_CAST4(yi); + const VECTOR4 U = SAT_CAST4(ui); + const VECTOR4 V = SAT_CAST4(vi); + + vstore16((VECTOR16)(Y.s0, U.s0, V.s0, 0, Y.s1, U.s1, V.s1, 0, Y.s2, U.s2, V.s2, 0, Y.s3, U.s3, V.s3, 0), 0, dst_ptr); +#endif + } +#endif //pixels_per_work_item } } @@ -176,10 +321,10 @@ __constant float c_YUV2RGBCoeffs_f[5] = { 2.032f, -0.395f, -0.581f, 1.140f }; __constant int c_YUV2RGBCoeffs_i[5] = { 33292, -6472, -9519, 18678 }; __kernel void YUV2RGB(int cols, int rows, int src_step, int dst_step, - int bidx, __global const DATA_TYPE* src, __global DATA_TYPE* dst, + __global const DATA_TYPE* src, __global DATA_TYPE* dst, int src_offset, int dst_offset) { - int x = get_global_id(0); + int x = get_global_id(0) * pixels_per_work_item; int y = get_global_id(1); if (y < rows && x < cols) @@ -187,26 +332,94 @@ __kernel void YUV2RGB(int cols, int rows, int src_step, int dst_step, x <<= 2; int src_idx = mad24(y, src_step, src_offset + x); int dst_idx = mad24(y, dst_step, dst_offset + x); - DATA_TYPE yuv[] = { src[src_idx], src[src_idx + 1], src[src_idx + 2] }; + + global DATA_TYPE *src_ptr = (global DATA_TYPE *)(src + src_idx); + global DATA_TYPE *dst_ptr = (global DATA_TYPE *)(dst + dst_idx); #ifdef DEPTH_5 __constant float * coeffs = c_YUV2RGBCoeffs_f; - float b = yuv[0] + (yuv[2] - HALF_MAX) * coeffs[3]; - float g = yuv[0] + (yuv[2] - HALF_MAX) * coeffs[2] + (yuv[1] - HALF_MAX) * coeffs[1]; - float r = yuv[0] + (yuv[1] - HALF_MAX) * coeffs[0]; #else __constant int * coeffs = c_YUV2RGBCoeffs_i; - int b = yuv[0] + CV_DESCALE((yuv[2] - HALF_MAX) * coeffs[3], yuv_shift); - int g = yuv[0] + CV_DESCALE((yuv[2] - HALF_MAX) * coeffs[2] + (yuv[1] - HALF_MAX) * coeffs[1], yuv_shift); - int r = yuv[0] + CV_DESCALE((yuv[1] - HALF_MAX) * coeffs[0], yuv_shift); #endif - dst[dst_idx + bidx] = SAT_CAST( b ); - dst[dst_idx + 1] = SAT_CAST( g ); - dst[dst_idx + (bidx^2)] = SAT_CAST( r ); -#if dcn == 4 - dst[dst_idx + 3] = MAX_NUM; +#if (1 == pixels_per_work_item) + { + const DATA_TYPE yuv[] = {src_ptr[0], src_ptr[1], src_ptr[2]}; + +#ifdef DEPTH_5 + float B = yuv[0] + (yuv[2] - HALF_MAX) * coeffs[3]; + float G = yuv[0] + (yuv[2] - HALF_MAX) * coeffs[2] + (yuv[1] - HALF_MAX) * coeffs[1]; + float R = yuv[0] + (yuv[1] - HALF_MAX) * coeffs[0]; +#else + int B = yuv[0] + CV_DESCALE((yuv[2] - HALF_MAX) * coeffs[3], yuv_shift); + int G = yuv[0] + CV_DESCALE((yuv[2] - HALF_MAX) * coeffs[2] + (yuv[1] - HALF_MAX) * coeffs[1], yuv_shift); + int R = yuv[0] + CV_DESCALE((yuv[1] - HALF_MAX) * coeffs[0], yuv_shift); #endif + + 
dst_ptr[bidx] = SAT_CAST( B ); + dst_ptr[1] = SAT_CAST( G ); + dst_ptr[(bidx^2)] = SAT_CAST( R ); +#if dcn == 4 + dst_ptr[3] = MAX_NUM; +#endif + } +#elif (2 == pixels_per_work_item) + { + const VECTOR8 r0 = vload8(0, src_ptr); + +#ifdef DEPTH_5 + const float2 Y = r0.s04; + const float2 U = r0.s15; + const float2 V = r0.s26; + + const float2 c0 = (bidx == 0) ? (Y + (V - HALF_MAX) * coeffs[3]) : (Y + (U - HALF_MAX) * coeffs[0]); + const float2 c1 = Y + (V - HALF_MAX) * coeffs[2] + (U - HALF_MAX) * coeffs[1]; + const float2 c2 = (bidx == 0) ? (Y + (U - HALF_MAX) * coeffs[0]) : (Y + (V - HALF_MAX) * coeffs[3]); +#else + const int2 Y = convert_int2(r0.s04); + const int2 U = convert_int2(r0.s15); + const int2 V = convert_int2(r0.s26); + + const int2 c0i = (bidx == 0) ? (Y + CV_DESCALE((V - HALF_MAX) * coeffs[3], yuv_shift)) : (Y + CV_DESCALE((U - HALF_MAX) * coeffs[0], yuv_shift)); + const int2 c1i = Y + CV_DESCALE((V - HALF_MAX) * coeffs[2] + (U - HALF_MAX) * coeffs[1], yuv_shift); + const int2 c2i = (bidx == 0) ? (Y + CV_DESCALE((U - HALF_MAX) * coeffs[0], yuv_shift)) : (Y + CV_DESCALE((V - HALF_MAX) * coeffs[3], yuv_shift)); + + const VECTOR2 c0 = SAT_CAST2(c0i); + const VECTOR2 c1 = SAT_CAST2(c1i); + const VECTOR2 c2 = SAT_CAST2(c2i); +#endif + +#if dcn == 4 + vstore8((VECTOR8)(c0.s0, c1.s0, c2.s0, MAX_NUM, c0.s1, c1.s1, c2.s1, MAX_NUM), 0, dst_ptr); +#else + vstore8((VECTOR8)(c0.s0, c1.s0, c2.s0, 0, c0.s1, c1.s1, c2.s1, 0), 0, dst_ptr); +#endif + } +#elif (4 == pixels_per_work_item) + { +#ifndef DEPTH_5 + const VECTOR16 r0 = vload16(0, src_ptr); + + const int4 Y = convert_int4(r0.s048c); + const int4 U = convert_int4(r0.s159d); + const int4 V = convert_int4(r0.s26ae); + + const int4 c0i = (bidx == 0) ? (Y + CV_DESCALE((V - HALF_MAX) * coeffs[3], yuv_shift)) : (Y + CV_DESCALE((U - HALF_MAX) * coeffs[0], yuv_shift)); + const int4 c1i = Y + CV_DESCALE((V - HALF_MAX) * coeffs[2] + (U - HALF_MAX) * coeffs[1], yuv_shift); + const int4 c2i = (bidx == 0) ? 
(Y + CV_DESCALE((U - HALF_MAX) * coeffs[0], yuv_shift)) : (Y + CV_DESCALE((V - HALF_MAX) * coeffs[3], yuv_shift)); + + const VECTOR4 c0 = SAT_CAST4(c0i); + const VECTOR4 c1 = SAT_CAST4(c1i); + const VECTOR4 c2 = SAT_CAST4(c2i); + +#if dcn == 4 + vstore16((VECTOR16)(c0.s0, c1.s0, c2.s0, MAX_NUM, c0.s1, c1.s1, c2.s1, MAX_NUM, c0.s2, c1.s2, c2.s2, MAX_NUM, c0.s3, c1.s3, c2.s3, MAX_NUM), 0, dst_ptr); +#else + vstore16((VECTOR16)(c0.s0, c1.s0, c2.s0, 0, c0.s1, c1.s1, c2.s1, 0, c0.s2, c1.s2, c2.s2, 0, c0.s3, c1.s3, c2.s3, 0), 0, dst_ptr); +#endif +#endif + } +#endif //pixels_per_work_item } } @@ -218,7 +431,7 @@ __constant int ITUR_BT_601_CVR = 1673527; __constant int ITUR_BT_601_SHIFT = 20; __kernel void YUV2RGBA_NV12(int cols, int rows, int src_step, int dst_step, - int bidx, __global const uchar* src, __global uchar* dst, + __global const uchar* src, __global uchar* dst, int src_offset, int dst_offset) { const int x = get_global_id(0); @@ -275,10 +488,10 @@ __constant float c_RGB2YCrCbCoeffs_f[5] = {0.299f, 0.587f, 0.114f, 0.713f, 0.564 __constant int c_RGB2YCrCbCoeffs_i[5] = {R2Y, G2Y, B2Y, 11682, 9241}; __kernel void RGB2YCrCb(int cols, int rows, int src_step, int dst_step, - int bidx, __global const DATA_TYPE* src, __global DATA_TYPE* dst, - int src_offset, int dst_offset) + __global const DATA_TYPE* src, __global DATA_TYPE* dst, + int src_offset, int dst_offset) { - int x = get_global_id(0); + int x = get_global_id(0) * pixels_per_work_item; int y = get_global_id(1); if (y < rows && x < cols) @@ -287,24 +500,82 @@ __kernel void RGB2YCrCb(int cols, int rows, int src_step, int dst_step, int src_idx = mad24(y, src_step, src_offset + x); int dst_idx = mad24(y, dst_step, dst_offset + x); - DATA_TYPE rgb[] = { src[src_idx], src[src_idx + 1], src[src_idx + 2] }; + global DATA_TYPE *src_ptr = (global DATA_TYPE *)(src + src_idx); + global DATA_TYPE *dst_ptr = (global DATA_TYPE *)(dst + dst_idx); #ifdef DEPTH_5 __constant float * coeffs = c_RGB2YCrCbCoeffs_f; - DATA_TYPE Y = rgb[0] * coeffs[bidx^2] + rgb[1] * coeffs[1] + rgb[2] * coeffs[bidx]; - DATA_TYPE Cr = (rgb[bidx^2] - Y) * coeffs[3] + HALF_MAX; - DATA_TYPE Cb = (rgb[bidx] - Y) * coeffs[4] + HALF_MAX; #else __constant int * coeffs = c_RGB2YCrCbCoeffs_i; - int delta = HALF_MAX * (1 << yuv_shift); - int Y = CV_DESCALE(rgb[0] * coeffs[bidx^2] + rgb[1] * coeffs[1] + rgb[2] * coeffs[bidx], yuv_shift); - int Cr = CV_DESCALE((rgb[bidx^2] - Y) * coeffs[3] + delta, yuv_shift); - int Cb = CV_DESCALE((rgb[bidx] - Y) * coeffs[4] + delta, yuv_shift); + const int delta = HALF_MAX * (1 << yuv_shift); #endif - dst[dst_idx] = SAT_CAST( Y ); - dst[dst_idx + 1] = SAT_CAST( Cr ); - dst[dst_idx + 2] = SAT_CAST( Cb ); +#if (1 == pixels_per_work_item) + { + const DATA_TYPE rgb[] = {src_ptr[0], src_ptr[1], src_ptr[2]}; + +#ifdef DEPTH_5 + float Y = rgb[0] * coeffs[bidx^2] + rgb[1] * coeffs[1] + rgb[2] * coeffs[bidx]; + float Cr = (rgb[bidx^2] - Y) * coeffs[3] + HALF_MAX; + float Cb = (rgb[bidx] - Y) * coeffs[4] + HALF_MAX; +#else + int Y = CV_DESCALE(rgb[0] * coeffs[bidx^2] + rgb[1] * coeffs[1] + rgb[2] * coeffs[bidx], yuv_shift); + int Cr = CV_DESCALE((rgb[bidx^2] - Y) * coeffs[3] + delta, yuv_shift); + int Cb = CV_DESCALE((rgb[bidx] - Y) * coeffs[4] + delta, yuv_shift); +#endif + + dst_ptr[0] = SAT_CAST( Y ); + dst_ptr[1] = SAT_CAST( Cr ); + dst_ptr[2] = SAT_CAST( Cb ); + } +#elif (2 == pixels_per_work_item) + { + const VECTOR8 r0 = vload8(0, src_ptr); + +#ifdef DEPTH_5 + const float2 c0 = r0.s04; + const float2 c1 = r0.s15; + const float2 c2 = r0.s26; + + const 
float2 Y = (bidx == 0) ? (c0 * coeffs[2] + c1 * coeffs[1] + c2 * coeffs[0]) : (c0 * coeffs[0] + c1 * coeffs[1] + c2 * coeffs[2]); + const float2 Cr = (bidx == 0) ? ((c2 - Y) * coeffs[3] + HALF_MAX) : ((c0 - Y) * coeffs[3] + HALF_MAX); + const float2 Cb = (bidx == 0) ? ((c0 - Y) * coeffs[4] + HALF_MAX) : ((c2 - Y) * coeffs[4] + HALF_MAX); +#else + const int2 c0 = convert_int2(r0.s04); + const int2 c1 = convert_int2(r0.s15); + const int2 c2 = convert_int2(r0.s26); + + const int2 yi = (bidx == 0) ? CV_DESCALE(c0 * coeffs[2] + c1 * coeffs[1] + c2 * coeffs[0], yuv_shift) : CV_DESCALE(c0 * coeffs[0] + c1 * coeffs[1] + c2 * coeffs[2], yuv_shift); + const int2 ui = (bidx == 0) ? CV_DESCALE((c2 - yi) * coeffs[3] + delta, yuv_shift) : CV_DESCALE((c0 - yi) * coeffs[3] + delta, yuv_shift); + const int2 vi = (bidx == 0) ? CV_DESCALE((c0 - yi) * coeffs[4] + delta, yuv_shift) : CV_DESCALE((c2 - yi) * coeffs[4] + delta, yuv_shift); + + const VECTOR2 Y = SAT_CAST2(yi); + const VECTOR2 Cr = SAT_CAST2(ui); + const VECTOR2 Cb = SAT_CAST2(vi); +#endif + + vstore8((VECTOR8)(Y.s0, Cr.s0, Cb.s0, 0, Y.s1, Cr.s1, Cb.s1, 0), 0, dst_ptr); + } +#elif (4 == pixels_per_work_item) + { +#ifndef DEPTH_5 + const VECTOR16 r0 = vload16(0, src_ptr); + const int4 c0 = convert_int4(r0.s048c); + const int4 c1 = convert_int4(r0.s159d); + const int4 c2 = convert_int4(r0.s26ae); + + const int4 yi = (bidx == 0) ? CV_DESCALE(c0 * coeffs[2] + c1 * coeffs[1] + c2 * coeffs[0], yuv_shift) : CV_DESCALE(c0 * coeffs[0] + c1 * coeffs[1] + c2 * coeffs[2], yuv_shift); + const int4 ui = (bidx == 0) ? CV_DESCALE((c2 - yi) * coeffs[3] + delta, yuv_shift) : CV_DESCALE((c0 - yi) * coeffs[3] + delta, yuv_shift); + const int4 vi = (bidx == 0) ? CV_DESCALE((c0 - yi) * coeffs[4] + delta, yuv_shift) : CV_DESCALE((c2 - yi) * coeffs[4] + delta, yuv_shift); + + const VECTOR4 Y = SAT_CAST4(yi); + const VECTOR4 Cr = SAT_CAST4(ui); + const VECTOR4 Cb = SAT_CAST4(vi); + + vstore16((VECTOR16)(Y.s0, Cr.s0, Cb.s0, 0, Y.s1, Cr.s1, Cb.s1, 0, Y.s2, Cr.s2, Cb.s2, 0, Y.s3, Cr.s3, Cb.s3, 0), 0, dst_ptr); +#endif + } +#endif //pixels_per_work_item } } @@ -312,10 +583,10 @@ __constant float c_YCrCb2RGBCoeffs_f[4] = { 1.403f, -0.714f, -0.344f, 1.773f }; __constant int c_YCrCb2RGBCoeffs_i[4] = { 22987, -11698, -5636, 29049 }; __kernel void YCrCb2RGB(int cols, int rows, int src_step, int dst_step, - int bidx, __global const DATA_TYPE* src, __global DATA_TYPE* dst, - int src_offset, int dst_offset) + __global const DATA_TYPE* src, __global DATA_TYPE* dst, + int src_offset, int dst_offset) { - int x = get_global_id(0); + int x = get_global_id(0) * pixels_per_work_item; int y = get_global_id(1); if (y < rows && x < cols) @@ -324,36 +595,103 @@ __kernel void YCrCb2RGB(int cols, int rows, int src_step, int dst_step, int src_idx = mad24(y, src_step, src_offset + x); int dst_idx = mad24(y, dst_step, dst_offset + x); - DATA_TYPE ycrcb[] = { src[src_idx], src[src_idx + 1], src[src_idx + 2] }; + global DATA_TYPE *src_ptr = (global DATA_TYPE *)(src + src_idx); + global DATA_TYPE *dst_ptr = (global DATA_TYPE *)(dst + dst_idx); #ifdef DEPTH_5 - __constant float * coeff = c_YCrCb2RGBCoeffs_f; - float r = ycrcb[0] + coeff[0] * (ycrcb[1] - HALF_MAX); - float g = ycrcb[0] + coeff[1] * (ycrcb[1] - HALF_MAX) + coeff[2] * (ycrcb[2] - HALF_MAX); - float b = ycrcb[0] + coeff[3] * (ycrcb[2] - HALF_MAX); + __constant float * coeffs = c_YCrCb2RGBCoeffs_f; #else - __constant int * coeff = c_YCrCb2RGBCoeffs_i; - int r = ycrcb[0] + CV_DESCALE(coeff[0] * (ycrcb[1] - HALF_MAX), yuv_shift); - int g = 
ycrcb[0] + CV_DESCALE(coeff[1] * (ycrcb[1] - HALF_MAX) + coeff[2] * (ycrcb[2] - HALF_MAX), yuv_shift); - int b = ycrcb[0] + CV_DESCALE(coeff[3] * (ycrcb[2] - HALF_MAX), yuv_shift); + __constant int * coeffs = c_YCrCb2RGBCoeffs_i; #endif - dst[dst_idx + (bidx^2)] = SAT_CAST(r); - dst[dst_idx + 1] = SAT_CAST(g); - dst[dst_idx + bidx] = SAT_CAST(b); -#if dcn == 4 - dst[dst_idx + 3] = MAX_NUM; +#if (1 == pixels_per_work_item) + { + const DATA_TYPE ycrcb[] = {src_ptr[0], src_ptr[1], src_ptr[2]}; + +#ifdef DEPTH_5 + float B = ycrcb[0] + (ycrcb[2] - HALF_MAX) * coeffs[3]; + float G = ycrcb[0] + (ycrcb[2] - HALF_MAX) * coeffs[2] + (ycrcb[1] - HALF_MAX) * coeffs[1]; + float R = ycrcb[0] + (ycrcb[1] - HALF_MAX) * coeffs[0]; +#else + int B = ycrcb[0] + CV_DESCALE((ycrcb[2] - HALF_MAX) * coeffs[3], yuv_shift); + int G = ycrcb[0] + CV_DESCALE((ycrcb[2] - HALF_MAX) * coeffs[2] + (ycrcb[1] - HALF_MAX) * coeffs[1], yuv_shift); + int R = ycrcb[0] + CV_DESCALE((ycrcb[1] - HALF_MAX) * coeffs[0], yuv_shift); #endif + + dst_ptr[bidx] = SAT_CAST( B ); + dst_ptr[1] = SAT_CAST( G ); + dst_ptr[(bidx^2)] = SAT_CAST( R ); +#if dcn == 4 + dst_ptr[3] = MAX_NUM; +#endif + } +#elif (2 == pixels_per_work_item) + { + const VECTOR8 r0 = vload8(0, src_ptr); + +#ifdef DEPTH_5 + const float2 Y = r0.s04; + const float2 Cr = r0.s15; + const float2 Cb = r0.s26; + + const float2 c0 = (bidx == 0) ? (Y + (Cb - HALF_MAX) * coeffs[3]) : (Y + (Cr - HALF_MAX) * coeffs[0]); + const float2 c1 = Y + (Cb - HALF_MAX) * coeffs[2] + (Cr - HALF_MAX) * coeffs[1]; + const float2 c2 = (bidx == 0) ? (Y + (Cr - HALF_MAX) * coeffs[0]) : (Y + (Cb - HALF_MAX) * coeffs[3]); +#else + const int2 Y = convert_int2(r0.s04); + const int2 Cr = convert_int2(r0.s15); + const int2 Cb = convert_int2(r0.s26); + + const int2 c0i = (bidx == 0) ? (Y + CV_DESCALE((Cb - HALF_MAX) * coeffs[3], yuv_shift)) : (Y + CV_DESCALE((Cr - HALF_MAX) * coeffs[0], yuv_shift)); + const int2 c1i = Y + CV_DESCALE((Cb - HALF_MAX) * coeffs[2] + (Cr - HALF_MAX) * coeffs[1], yuv_shift); + const int2 c2i = (bidx == 0) ? (Y + CV_DESCALE((Cr - HALF_MAX) * coeffs[0], yuv_shift)) : (Y + CV_DESCALE((Cb - HALF_MAX) * coeffs[3], yuv_shift)); + + const VECTOR2 c0 = SAT_CAST2(c0i); + const VECTOR2 c1 = SAT_CAST2(c1i); + const VECTOR2 c2 = SAT_CAST2(c2i); +#endif + +#if dcn == 4 + vstore8((VECTOR8)(c0.s0, c1.s0, c2.s0, MAX_NUM, c0.s1, c1.s1, c2.s1, MAX_NUM), 0, dst_ptr); +#else + vstore8((VECTOR8)(c0.s0, c1.s0, c2.s0, 0, c0.s1, c1.s1, c2.s1, 0), 0, dst_ptr); +#endif + } +#elif (4 == pixels_per_work_item) + { +#ifndef DEPTH_5 + const VECTOR16 r0 = vload16(0, src_ptr); + + const int4 Y = convert_int4(r0.s048c); + const int4 Cr = convert_int4(r0.s159d); + const int4 Cb = convert_int4(r0.s26ae); + + const int4 c0i = (bidx == 0) ? (Y + CV_DESCALE((Cb - HALF_MAX) * coeffs[3], yuv_shift)) : (Y + CV_DESCALE((Cr - HALF_MAX) * coeffs[0], yuv_shift)); + const int4 c1i = Y + CV_DESCALE((Cb - HALF_MAX) * coeffs[2] + (Cr - HALF_MAX) * coeffs[1], yuv_shift); + const int4 c2i = (bidx == 0) ? 
(Y + CV_DESCALE((Cr - HALF_MAX) * coeffs[0], yuv_shift)) : (Y + CV_DESCALE((Cb - HALF_MAX) * coeffs[3], yuv_shift)); + + const VECTOR4 c0 = SAT_CAST4(c0i); + const VECTOR4 c1 = SAT_CAST4(c1i); + const VECTOR4 c2 = SAT_CAST4(c2i); + +#if dcn == 4 + vstore16((VECTOR16)(c0.s0, c1.s0, c2.s0, MAX_NUM, c0.s1, c1.s1, c2.s1, MAX_NUM, c0.s2, c1.s2, c2.s2, MAX_NUM, c0.s3, c1.s3, c2.s3, MAX_NUM), 0, dst_ptr); +#else + vstore16((VECTOR16)(c0.s0, c1.s0, c2.s0, 0, c0.s1, c1.s1, c2.s1, 0, c0.s2, c1.s2, c2.s2, 0, c0.s3, c1.s3, c2.s3, 0), 0, dst_ptr); +#endif +#endif + } +#endif //pixels_per_work_item } } ///////////////////////////////////// RGB <-> XYZ ////////////////////////////////////// __kernel void RGB2XYZ(int cols, int rows, int src_step, int dst_step, - int bidx, __global const DATA_TYPE* src, __global DATA_TYPE* dst, + __global const DATA_TYPE* src, __global DATA_TYPE* dst, int src_offset, int dst_offset, __constant COEFF_TYPE * coeffs) { - int dx = get_global_id(0); + int dx = get_global_id(0) * pixels_per_work_item; int dy = get_global_id(1); if (dy < rows && dx < cols) @@ -362,28 +700,84 @@ __kernel void RGB2XYZ(int cols, int rows, int src_step, int dst_step, int src_idx = mad24(dy, src_step, src_offset + dx); int dst_idx = mad24(dy, dst_step, dst_offset + dx); - DATA_TYPE r = src[src_idx], g = src[src_idx + 1], b = src[src_idx + 2]; + global DATA_TYPE *src_ptr = (global DATA_TYPE *)(src + src_idx); + global DATA_TYPE *dst_ptr = (global DATA_TYPE *)(dst + dst_idx); + +#if (1 == pixels_per_work_item) + { + DATA_TYPE R = src_ptr[0], G = src_ptr[1], B = src_ptr[2]; #ifdef DEPTH_5 - float x = r * coeffs[0] + g * coeffs[1] + b * coeffs[2]; - float y = r * coeffs[3] + g * coeffs[4] + b * coeffs[5]; - float z = r * coeffs[6] + g * coeffs[7] + b * coeffs[8]; + float X = R * coeffs[0] + G * coeffs[1] + B * coeffs[2]; + float Y = R * coeffs[3] + G * coeffs[4] + B * coeffs[5]; + float Z = R * coeffs[6] + G * coeffs[7] + B * coeffs[8]; #else - int x = CV_DESCALE(r * coeffs[0] + g * coeffs[1] + b * coeffs[2], xyz_shift); - int y = CV_DESCALE(r * coeffs[3] + g * coeffs[4] + b * coeffs[5], xyz_shift); - int z = CV_DESCALE(r * coeffs[6] + g * coeffs[7] + b * coeffs[8], xyz_shift); + int X = CV_DESCALE(R * coeffs[0] + G * coeffs[1] + B * coeffs[2], xyz_shift); + int Y = CV_DESCALE(R * coeffs[3] + G * coeffs[4] + B * coeffs[5], xyz_shift); + int Z = CV_DESCALE(R * coeffs[6] + G * coeffs[7] + B * coeffs[8], xyz_shift); #endif - dst[dst_idx] = SAT_CAST(x); - dst[dst_idx + 1] = SAT_CAST(y); - dst[dst_idx + 2] = SAT_CAST(z); + + dst_ptr[0] = SAT_CAST( X ); + dst_ptr[1] = SAT_CAST( Y ); + dst_ptr[2] = SAT_CAST( Z ); + } +#elif (2 == pixels_per_work_item) + { + const VECTOR8 r0 = vload8(0, src_ptr); + +#ifdef DEPTH_5 + const float2 R = r0.s04; + const float2 G = r0.s15; + const float2 B = r0.s26; + + const float2 X = R * coeffs[0] + G * coeffs[1] + B * coeffs[2]; + const float2 Y = R * coeffs[3] + G * coeffs[4] + B * coeffs[5]; + const float2 Z = R * coeffs[6] + G * coeffs[7] + B * coeffs[8]; +#else + const int2 R = convert_int2(r0.s04); + const int2 G = convert_int2(r0.s15); + const int2 B = convert_int2(r0.s26); + + const int2 xi = CV_DESCALE(R * coeffs[0] + G * coeffs[1] + B * coeffs[2], xyz_shift); + const int2 yi = CV_DESCALE(R * coeffs[3] + G * coeffs[4] + B * coeffs[5], xyz_shift); + const int2 zi = CV_DESCALE(R * coeffs[6] + G * coeffs[7] + B * coeffs[8], xyz_shift); + + const VECTOR2 X = SAT_CAST2(xi); + const VECTOR2 Y = SAT_CAST2(yi); + const VECTOR2 Z = SAT_CAST2(zi); +#endif + + vstore8((VECTOR8)(X.s0, 
Y.s0, Z.s0, 0, X.s1, Y.s1, Z.s1, 0), 0, dst_ptr); + } +#elif (4 == pixels_per_work_item) + { +#ifndef DEPTH_5 + const VECTOR16 r0 = vload16(0, src_ptr); + + const int4 R = convert_int4(r0.s048c); + const int4 G = convert_int4(r0.s159d); + const int4 B = convert_int4(r0.s26ae); + + const int4 xi = CV_DESCALE(R * coeffs[0] + G * coeffs[1] + B * coeffs[2], xyz_shift); + const int4 yi = CV_DESCALE(R * coeffs[3] + G * coeffs[4] + B * coeffs[5], xyz_shift); + const int4 zi = CV_DESCALE(R * coeffs[6] + G * coeffs[7] + B * coeffs[8], xyz_shift); + + const VECTOR4 X = SAT_CAST4(xi); + const VECTOR4 Y = SAT_CAST4(yi); + const VECTOR4 Z = SAT_CAST4(zi); + + vstore16((VECTOR16)(X.s0, Y.s0, Z.s0, 0, X.s1, Y.s1, Z.s1, 0, X.s2, Y.s2, Z.s2, 0, X.s3, Y.s3, Z.s3, 0), 0, dst_ptr); +#endif + } +#endif //pixels_per_work_item } } __kernel void XYZ2RGB(int cols, int rows, int src_step, int dst_step, - int bidx, __global const DATA_TYPE* src, __global DATA_TYPE* dst, + __global const DATA_TYPE* src, __global DATA_TYPE* dst, int src_offset, int dst_offset, __constant COEFF_TYPE * coeffs) { - int dx = get_global_id(0); + int dx = get_global_id(0) * pixels_per_work_item; int dy = get_global_id(1); if (dy < rows && dx < cols) @@ -392,23 +786,87 @@ __kernel void XYZ2RGB(int cols, int rows, int src_step, int dst_step, int src_idx = mad24(dy, src_step, src_offset + dx); int dst_idx = mad24(dy, dst_step, dst_offset + dx); - DATA_TYPE x = src[src_idx], y = src[src_idx + 1], z = src[src_idx + 2]; + global DATA_TYPE *src_ptr = (global DATA_TYPE *)(src + src_idx); + global DATA_TYPE *dst_ptr = (global DATA_TYPE *)(dst + dst_idx); + +#if (1 == pixels_per_work_item) + { + const DATA_TYPE X = src_ptr[0], Y = src_ptr[1], Z = src_ptr[2]; #ifdef DEPTH_5 - float b = x * coeffs[0] + y * coeffs[1] + z * coeffs[2]; - float g = x * coeffs[3] + y * coeffs[4] + z * coeffs[5]; - float r = x * coeffs[6] + y * coeffs[7] + z * coeffs[8]; + float B = X * coeffs[0] + Y * coeffs[1] + Z * coeffs[2]; + float G = X * coeffs[3] + Y * coeffs[4] + Z * coeffs[5]; + float R = X * coeffs[6] + Y * coeffs[7] + Z * coeffs[8]; #else - int b = CV_DESCALE(x * coeffs[0] + y * coeffs[1] + z * coeffs[2], xyz_shift); - int g = CV_DESCALE(x * coeffs[3] + y * coeffs[4] + z * coeffs[5], xyz_shift); - int r = CV_DESCALE(x * coeffs[6] + y * coeffs[7] + z * coeffs[8], xyz_shift); + int B = CV_DESCALE(X * coeffs[0] + Y * coeffs[1] + Z * coeffs[2], xyz_shift); + int G = CV_DESCALE(X * coeffs[3] + Y * coeffs[4] + Z * coeffs[5], xyz_shift); + int R = CV_DESCALE(X * coeffs[6] + Y * coeffs[7] + Z * coeffs[8], xyz_shift); #endif - dst[dst_idx] = SAT_CAST(b); - dst[dst_idx + 1] = SAT_CAST(g); - dst[dst_idx + 2] = SAT_CAST(r); + + dst_ptr[0] = SAT_CAST( B ); + dst_ptr[1] = SAT_CAST( G ); + dst_ptr[2] = SAT_CAST( R ); #if dcn == 4 - dst[dst_idx + 3] = MAX_NUM; + dst_ptr[3] = MAX_NUM; #endif + } +#elif (2 == pixels_per_work_item) + { + const VECTOR8 r0 = vload8(0, src_ptr); + +#ifdef DEPTH_5 + const float2 X = r0.s04; + const float2 Y = r0.s15; + const float2 Z = r0.s26; + + float2 B = X * coeffs[0] + Y * coeffs[1] + Z * coeffs[2]; + float2 G = X * coeffs[3] + Y * coeffs[4] + Z * coeffs[5]; + float2 R = X * coeffs[6] + Y * coeffs[7] + Z * coeffs[8]; +#else + const int2 xi = convert_int2(r0.s04); + const int2 yi = convert_int2(r0.s15); + const int2 zi = convert_int2(r0.s26); + + const int2 bi = CV_DESCALE(xi * coeffs[0] + yi * coeffs[1] + zi * coeffs[2], xyz_shift); + const int2 gi = CV_DESCALE(xi * coeffs[3] + yi * coeffs[4] + zi * coeffs[5], xyz_shift); + const int2 ri = 
CV_DESCALE(xi * coeffs[6] + yi * coeffs[7] + zi * coeffs[8], xyz_shift); + + const VECTOR2 R = SAT_CAST2(ri); + const VECTOR2 G = SAT_CAST2(gi); + const VECTOR2 B = SAT_CAST2(bi); +#endif + +#if dcn == 4 + vstore8((VECTOR8)(B.s0, G.s0, R.s0, MAX_NUM, B.s1, G.s1, R.s1, MAX_NUM), 0, dst_ptr); +#else + vstore8((VECTOR8)(B.s0, G.s0, R.s0, 0, B.s1, G.s1, R.s1, 0), 0, dst_ptr); +#endif + } +#elif (4 == pixels_per_work_item) + { +#ifndef DEPTH_5 + const VECTOR16 r0 = vload16(0, src_ptr); + + const int4 xi = convert_int4(r0.s048c); + const int4 yi = convert_int4(r0.s159d); + const int4 zi = convert_int4(r0.s26ae); + + const int4 bi = CV_DESCALE(xi * coeffs[0] + yi * coeffs[1] + zi * coeffs[2], xyz_shift); + const int4 gi = CV_DESCALE(xi * coeffs[3] + yi * coeffs[4] + zi * coeffs[5], xyz_shift); + const int4 ri = CV_DESCALE(xi * coeffs[6] + yi * coeffs[7] + zi * coeffs[8], xyz_shift); + + const VECTOR4 R = SAT_CAST4(ri); + const VECTOR4 G = SAT_CAST4(gi); + const VECTOR4 B = SAT_CAST4(bi); + +#if dcn == 4 + vstore16((VECTOR16)(B.s0, G.s0, R.s0, MAX_NUM, B.s1, G.s1, R.s1, MAX_NUM, B.s2, G.s2, R.s2, MAX_NUM, B.s3, G.s3, R.s3, MAX_NUM), 0, dst_ptr); +#else + vstore16((VECTOR16)(B.s0, G.s0, R.s0, 0, B.s1, G.s1, R.s1, 0, B.s2, G.s2, R.s2, 0, B.s3, G.s3, R.s3, 0), 0, dst_ptr); +#endif +#endif + } +#endif // pixels_per_work_item } } @@ -427,6 +885,7 @@ __kernel void RGB(int cols, int rows, int src_step, int dst_step, int src_idx = mad24(y, src_step, src_offset + x); int dst_idx = mad24(y, dst_step, dst_offset + x); +#ifndef INTEL_DEVICE #ifdef REVERSE dst[dst_idx] = src[src_idx + 2]; dst[dst_idx + 1] = src[src_idx + 1]; @@ -444,12 +903,43 @@ __kernel void RGB(int cols, int rows, int src_step, int dst_step, dst[dst_idx + 3] = src[src_idx + 3]; #endif #endif +#else //INTEL_DEVICE + global DATA_TYPE *src_ptr = (global DATA_TYPE *)(src + src_idx); + global DATA_TYPE *dst_ptr = (global DATA_TYPE *)(dst + dst_idx); + + const VECTOR4 r0 = vload4(0, src_ptr); +#ifdef REVERSE + if (3 == dcn) + { + vstore4((VECTOR4)(r0.s210, 0), 0, dst_ptr); + } + else if (3 == scn) + { + vstore4((VECTOR4)(r0.s210, MAX_NUM), 0, dst_ptr); + } + else { + vstore4((VECTOR4)(r0.s2103), 0, dst_ptr); + } +#elif defined ORDER + if (3 == dcn) + { + vstore4((VECTOR4)(r0.s012, 0), 0, dst_ptr); + } + else if (3 == scn) + { + vstore4((VECTOR4)(r0.s012, MAX_NUM), 0, dst_ptr); + } + else { + vstore4(r0, 0, dst_ptr); + } +#endif +#endif //INTEL_DEVICE } } ///////////////////////////////////// RGB5x5 <-> RGB ////////////////////////////////////// -__kernel void RGB5x52RGB(int cols, int rows, int src_step, int dst_step, int bidx, +__kernel void RGB5x52RGB(int cols, int rows, int src_step, int dst_step, __global const ushort * src, __global uchar * dst, int src_offset, int dst_offset) { @@ -482,7 +972,7 @@ __kernel void RGB5x52RGB(int cols, int rows, int src_step, int dst_step, int bid } } -__kernel void RGB2RGB5x5(int cols, int rows, int src_step, int dst_step, int bidx, +__kernel void RGB2RGB5x5(int cols, int rows, int src_step, int dst_step, __global const uchar * src, __global ushort * dst, int src_offset, int dst_offset) { @@ -507,7 +997,7 @@ __kernel void RGB2RGB5x5(int cols, int rows, int src_step, int dst_step, int bid ///////////////////////////////////// RGB5x5 <-> RGB ////////////////////////////////////// -__kernel void BGR5x52Gray(int cols, int rows, int src_step, int dst_step, int bidx, +__kernel void BGR5x52Gray(int cols, int rows, int src_step, int dst_step, __global const ushort * src, __global uchar * dst, int src_offset, int 
dst_offset) { @@ -532,7 +1022,7 @@ __kernel void BGR5x52Gray(int cols, int rows, int src_step, int dst_step, int bi } } -__kernel void Gray2BGR5x5(int cols, int rows, int src_step, int dst_step, int bidx, +__kernel void Gray2BGR5x5(int cols, int rows, int src_step, int dst_step, __global const uchar * src, __global ushort * dst, int src_offset, int dst_offset) { @@ -560,7 +1050,7 @@ __constant int sector_data[][3] = { {1, 3, 0}, { 1, 0, 2 }, { 3, 0, 1 }, { 0, 2, #ifdef DEPTH_0 -__kernel void RGB2HSV(int cols, int rows, int src_step, int dst_step, int bidx, +__kernel void RGB2HSV(int cols, int rows, int src_step, int dst_step, __global const uchar * src, __global uchar * dst, int src_offset, int dst_offset, __constant int * sdiv_table, __constant int * hdiv_table) @@ -600,7 +1090,7 @@ __kernel void RGB2HSV(int cols, int rows, int src_step, int dst_step, int bidx, } } -__kernel void HSV2RGB(int cols, int rows, int src_step, int dst_step, int bidx, +__kernel void HSV2RGB(int cols, int rows, int src_step, int dst_step, __global const uchar * src, __global uchar * dst, int src_offset, int dst_offset) { @@ -656,7 +1146,7 @@ __kernel void HSV2RGB(int cols, int rows, int src_step, int dst_step, int bidx, #elif defined DEPTH_5 -__kernel void RGB2HSV(int cols, int rows, int src_step, int dst_step, int bidx, +__kernel void RGB2HSV(int cols, int rows, int src_step, int dst_step, __global const float * src, __global float * dst, int src_offset, int dst_offset) { @@ -698,7 +1188,7 @@ __kernel void RGB2HSV(int cols, int rows, int src_step, int dst_step, int bidx, } } -__kernel void HSV2RGB(int cols, int rows, int src_step, int dst_step, int bidx, +__kernel void HSV2RGB(int cols, int rows, int src_step, int dst_step, __global const float * src, __global float * dst, int src_offset, int dst_offset) { @@ -758,7 +1248,7 @@ __kernel void HSV2RGB(int cols, int rows, int src_step, int dst_step, int bidx, #ifdef DEPTH_0 -__kernel void RGB2HLS(int cols, int rows, int src_step, int dst_step, int bidx, +__kernel void RGB2HLS(int cols, int rows, int src_step, int dst_step, __global const uchar * src, __global uchar * dst, int src_offset, int dst_offset) { @@ -805,7 +1295,7 @@ __kernel void RGB2HLS(int cols, int rows, int src_step, int dst_step, int bidx, } } -__kernel void HLS2RGB(int cols, int rows, int src_step, int dst_step, int bidx, +__kernel void HLS2RGB(int cols, int rows, int src_step, int dst_step, __global const uchar * src, __global uchar * dst, int src_offset, int dst_offset) { @@ -860,7 +1350,7 @@ __kernel void HLS2RGB(int cols, int rows, int src_step, int dst_step, int bidx, #elif defined DEPTH_5 -__kernel void RGB2HLS(int cols, int rows, int src_step, int dst_step, int bidx, +__kernel void RGB2HLS(int cols, int rows, int src_step, int dst_step, __global const float * src, __global float * dst, int src_offset, int dst_offset) { @@ -907,7 +1397,7 @@ __kernel void RGB2HLS(int cols, int rows, int src_step, int dst_step, int bidx, } } -__kernel void HLS2RGB(int cols, int rows, int src_step, int dst_step, int bidx, +__kernel void HLS2RGB(int cols, int rows, int src_step, int dst_step, __global const float * src, __global float * dst, int src_offset, int dst_offset) { @@ -968,33 +1458,10 @@ __kernel void HLS2RGB(int cols, int rows, int src_step, int dst_step, int bidx, #ifdef DEPTH_0 __kernel void RGBA2mRGBA(int cols, int rows, int src_step, int dst_step, - int bidx, __global const uchar * src, __global uchar * dst, - int src_offset, int dst_offset) -{ - int x = get_global_id(0); - int y = get_global_id(1); 
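/*
 * Illustrative aside, not part of the patch: the colour-conversion kernels
 * above now receive the blue-channel index 'bidx' and 'pixels_per_work_item'
 * as compile-time defines instead of kernel arguments, and their integer
 * paths round with CV_DESCALE. The small C program below reproduces that
 * fixed-point RGB-to-gray arithmetic on the host so the rounding can be
 * checked against a scalar reference; the function name and sample pixel
 * values are mine, the constants come from the kernel source.
 */
#include <stdio.h>

#define CV_DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n))
enum { yuv_shift = 14, R2Y = 4899, G2Y = 9617, B2Y = 1868 };

/* bidx is the position of the blue channel (0 for BGR, 2 for RGB);
 * red then sits at bidx ^ 2, exactly as the kernels index it. */
static unsigned char rgb2gray(const unsigned char *pix, int bidx)
{
    return (unsigned char)CV_DESCALE(
        pix[bidx] * B2Y + pix[1] * G2Y + pix[bidx ^ 2] * R2Y, yuv_shift);
}

int main(void)
{
    const unsigned char bgr[3] = { 250, 30, 100 }; /* B, G, R */
    const unsigned char rgb[3] = { 100, 30, 250 }; /* same pixel as R, G, B */
    /* Both layouts give the same gray value once bidx is set accordingly. */
    printf("%d %d\n", rgb2gray(bgr, 0), rgb2gray(rgb, 2));
    return 0;
}
/*
 * Because B2Y + G2Y + R2Y == 1 << yuv_shift, the weights sum to one and the
 * intermediate sum stays far below 32-bit range for 8-bit inputs.
 */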
- - if (y < rows && x < cols) - { - x <<= 2; - int src_idx = mad24(y, src_step, src_offset + x); - int dst_idx = mad24(y, dst_step, dst_offset + x); - - uchar v0 = src[src_idx], v1 = src[src_idx + 1]; - uchar v2 = src[src_idx + 2], v3 = src[src_idx + 3]; - - dst[dst_idx] = (v0 * v3 + HALF_MAX) / MAX_NUM; - dst[dst_idx + 1] = (v1 * v3 + HALF_MAX) / MAX_NUM; - dst[dst_idx + 2] = (v2 * v3 + HALF_MAX) / MAX_NUM; - dst[dst_idx + 3] = v3; - } -} - -__kernel void mRGBA2RGBA(int cols, int rows, int src_step, int dst_step, int bidx, __global const uchar * src, __global uchar * dst, int src_offset, int dst_offset) { - int x = get_global_id(0); + int x = get_global_id(0) * pixels_per_work_item; int y = get_global_id(1); if (y < rows && x < cols) @@ -1003,14 +1470,129 @@ __kernel void mRGBA2RGBA(int cols, int rows, int src_step, int dst_step, int bid int src_idx = mad24(y, src_step, src_offset + x); int dst_idx = mad24(y, dst_step, dst_offset + x); - uchar v0 = src[src_idx], v1 = src[src_idx + 1]; - uchar v2 = src[src_idx + 2], v3 = src[src_idx + 3]; - uchar v3_half = v3 / 2; + global DATA_TYPE *src_ptr = (global DATA_TYPE *)(src + src_idx); + global DATA_TYPE *dst_ptr = (global DATA_TYPE *)(dst + dst_idx); - dst[dst_idx] = v3 == 0 ? 0 : (v0 * MAX_NUM + v3_half) / v3; - dst[dst_idx + 1] = v3 == 0 ? 0 : (v1 * MAX_NUM + v3_half) / v3; - dst[dst_idx + 2] = v3 == 0 ? 0 : (v2 * MAX_NUM + v3_half) / v3; - dst[dst_idx + 3] = v3; +#if (1 == pixels_per_work_item) + { + const uchar4 r0 = vload4(0, src_ptr); + + dst_ptr[0] = (r0.s0 * r0.s3 + HALF_MAX) / MAX_NUM; + dst_ptr[1] = (r0.s1 * r0.s3 + HALF_MAX) / MAX_NUM; + dst_ptr[2] = (r0.s2 * r0.s3 + HALF_MAX) / MAX_NUM; + dst_ptr[3] = r0.s3; + } +#elif (2 == pixels_per_work_item) + { + const uchar8 r0 = vload8(0, src_ptr); + + const int2 v0 = convert_int2(r0.s04); + const int2 v1 = convert_int2(r0.s15); + const int2 v2 = convert_int2(r0.s26); + const int2 v3 = convert_int2(r0.s37); + + const int2 ri = (v0 * v3 + HALF_MAX) / MAX_NUM; + const int2 gi = (v1 * v3 + HALF_MAX) / MAX_NUM; + const int2 bi = (v2 * v3 + HALF_MAX) / MAX_NUM; + + const uchar2 r = convert_uchar2(ri); + const uchar2 g = convert_uchar2(gi); + const uchar2 b = convert_uchar2(bi); + + vstore8((uchar8)(r.s0, g.s0, b.s0, v3.s0, r.s1, g.s1, b.s1, v3.s1), 0, dst_ptr); + } +#elif (4 == pixels_per_work_item) + { + const uchar16 r0 = vload16(0, src_ptr); + + const int4 v0 = convert_int4(r0.s048c); + const int4 v1 = convert_int4(r0.s159d); + const int4 v2 = convert_int4(r0.s26ae); + const int4 v3 = convert_int4(r0.s37bf); + + const int4 ri = (v0 * v3 + HALF_MAX) / MAX_NUM; + const int4 gi = (v1 * v3 + HALF_MAX) / MAX_NUM; + const int4 bi = (v2 * v3 + HALF_MAX) / MAX_NUM; + + const uchar4 r = convert_uchar4(ri); + const uchar4 g = convert_uchar4(gi); + const uchar4 b = convert_uchar4(bi); + + vstore16((uchar16)(r.s0, g.s0, b.s0, v3.s0, r.s1, g.s1, b.s1, v3.s1, r.s2, g.s2, b.s2, v3.s2, r.s3, g.s3, b.s3, v3.s3), 0, dst_ptr); + } +#endif // pixels_per_work_item + } +} + +__kernel void mRGBA2RGBA(int cols, int rows, int src_step, int dst_step, + __global const uchar * src, __global uchar * dst, + int src_offset, int dst_offset) +{ + int x = get_global_id(0) * pixels_per_work_item; + int y = get_global_id(1); + + if (y < rows && x < cols) + { + x <<= 2; + int src_idx = mad24(y, src_step, src_offset + x); + int dst_idx = mad24(y, dst_step, dst_offset + x); + + global DATA_TYPE *src_ptr = (global DATA_TYPE *)(src + src_idx); + global DATA_TYPE *dst_ptr = (global DATA_TYPE *)(dst + dst_idx); + +#if (1 == 
pixels_per_work_item) + { + const uchar4 r0 = vload4(0, src_ptr); + const uchar v3_half = r0.s3 / 2; + + const uchar r = (r0.s3 == 0) ? 0 : (r0.s0 * MAX_NUM + v3_half) / r0.s3; + const uchar g = (r0.s3 == 0) ? 0 : (r0.s1 * MAX_NUM + v3_half) / r0.s3; + const uchar b = (r0.s3 == 0) ? 0 : (r0.s2 * MAX_NUM + v3_half) / r0.s3; + + vstore4((uchar4)(r, g, b, r0.s3), 0, dst_ptr); + } +#elif (2 == pixels_per_work_item) + { + const uchar8 r0 = vload8(0, src_ptr); + + const int2 v0 = convert_int2(r0.s04); + const int2 v1 = convert_int2(r0.s15); + const int2 v2 = convert_int2(r0.s26); + const int2 v3 = convert_int2(r0.s37); + const int2 v3_half = v3 / 2; + + const int2 ri = (v3 == 0) ? 0 : (v0 * MAX_NUM + v3_half) / v3; + const int2 gi = (v3 == 0) ? 0 : (v1 * MAX_NUM + v3_half) / v3; + const int2 bi = (v3 == 0) ? 0 : (v2 * MAX_NUM + v3_half) / v3; + + const uchar2 r = convert_uchar2(ri); + const uchar2 g = convert_uchar2(gi); + const uchar2 b = convert_uchar2(bi); + + vstore8((uchar8)(r.s0, g.s0, b.s0, v3.s0, r.s1, g.s1, b.s1, v3.s1), 0, dst_ptr); + } +#elif (4 == pixels_per_work_item) + { + const uchar16 r0 = vload16(0, src_ptr); + + const int4 v0 = convert_int4(r0.s048c); + const int4 v1 = convert_int4(r0.s159d); + const int4 v2 = convert_int4(r0.s26ae); + const int4 v3 = convert_int4(r0.s37bf); + const int4 v3_half = v3 / 2; + + + const int4 ri = (v3 == 0) ? 0 : (v0 * MAX_NUM + v3_half) / v3; + const int4 gi = (v3 == 0) ? 0 : (v1 * MAX_NUM + v3_half) / v3; + const int4 bi = (v3 == 0) ? 0 : (v2 * MAX_NUM + v3_half) / v3; + + const uchar4 r = convert_uchar4(ri); + const uchar4 g = convert_uchar4(gi); + const uchar4 b = convert_uchar4(bi); + + vstore16((uchar16)(r.s0, g.s0, b.s0, v3.s0, r.s1, g.s1, b.s1, v3.s1, r.s2, g.s2, b.s2, v3.s2, r.s3, g.s3, b.s3, v3.s3), 0, dst_ptr); + } +#endif // pixels_per_work_item } } diff --git a/modules/ocl/src/opencl/filtering_sep_filter_singlepass.cl b/modules/ocl/src/opencl/filtering_sep_filter_singlepass.cl new file mode 100644 index 000000000..c6555bff0 --- /dev/null +++ b/modules/ocl/src/opencl/filtering_sep_filter_singlepass.cl @@ -0,0 +1,185 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2013, Intel Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. 
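/*
 * Illustrative aside, not part of the patch: a scalar C reference for the
 * alpha (un)premultiplication rounding used by the vectorized RGBA2mRGBA and
 * mRGBA2RGBA kernels above, handy for checking the vload/vstore paths against
 * a plain implementation. Only the formulas come from the kernels; the
 * function names and test values are mine.
 */
#include <stdio.h>

enum { MAX_NUM = 255, HALF_MAX = 128 }; /* DEPTH_0 constants from the kernel */

static unsigned char premultiply(unsigned char v, unsigned char a)
{
    return (unsigned char)((v * a + HALF_MAX) / MAX_NUM);
}

static unsigned char unpremultiply(unsigned char v, unsigned char a)
{
    /* a == 0 maps to 0, matching the (v3 == 0) ? 0 : ... select above. */
    return a == 0 ? 0 : (unsigned char)((v * MAX_NUM + a / 2) / a);
}

int main(void)
{
    const unsigned char v = 200;
    const unsigned char alphas[2] = { 37, 3 };
    for (int k = 0; k < 2; k++)
    {
        unsigned char m = premultiply(v, alphas[k]);
        printf("a=%d premultiplied=%d restored=%d\n",
               alphas[k], m, unpremultiply(m, alphas[k]));
    }
    return 0;
}
/*
 * The round trip is exact here for a = 37 but lossy for a = 3: very small
 * alphas collapse the colour range, which is inherent to premultiplication,
 * not to the OpenCL port.
 */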
+// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ +/////////////////////////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////Macro for border type//////////////////////////////////////////// +///////////////////////////////////////////////////////////////////////////////////////////////// + +#ifdef BORDER_CONSTANT +//CCCCCC|abcdefgh|CCCCCCC +#define EXTRAPOLATE(x, maxV) +#elif defined BORDER_REPLICATE +//aaaaaa|abcdefgh|hhhhhhh +#define EXTRAPOLATE(x, maxV) \ + { \ + (x) = max(min((x), (maxV) - 1), 0); \ + } +#elif defined BORDER_WRAP +//cdefgh|abcdefgh|abcdefg +#define EXTRAPOLATE(x, maxV) \ + { \ + (x) = ( (x) + (maxV) ) % (maxV); \ + } +#elif defined BORDER_REFLECT +//fedcba|abcdefgh|hgfedcb +#define EXTRAPOLATE(x, maxV) \ + { \ + (x) = min(((maxV)-1)*2-(x)+1, max((x),-(x)-1) ); \ + } +#elif defined BORDER_REFLECT_101 +//gfedcb|abcdefgh|gfedcba +#define EXTRAPOLATE(x, maxV) \ + { \ + (x) = min(((maxV)-1)*2-(x), max((x),-(x)) ); \ + } +#else +#error No extrapolation method +#endif + +#define SRC(_x,_y) CONVERT_SRCTYPE(((global SRCTYPE*)(Src+(_y)*SrcPitch))[_x]) + +#ifdef BORDER_CONSTANT +//CCCCCC|abcdefgh|CCCCCCC +#define ELEM(_x,_y,r_edge,t_edge,const_v) (_x)<0 | (_x) >= (r_edge) | (_y)<0 | (_y) >= (t_edge) ? 
(const_v) : SRC((_x),(_y)) +#else +#define ELEM(_x,_y,r_edge,t_edge,const_v) SRC((_x),(_y)) +#endif + +#define DST(_x,_y) (((global DSTTYPE*)(Dst+DstOffset+(_y)*DstPitch))[_x]) + +//horizontal and vertical filter kernels +//should be defined on host during compile time to avoid overhead +__constant uint mat_kernelX[] = {KERNEL_MATRIX_X}; +__constant uint mat_kernelY[] = {KERNEL_MATRIX_Y}; + +__kernel __attribute__((reqd_work_group_size(BLK_X,BLK_Y,1))) void sep_filter_singlepass + ( + __global uchar* Src, + const uint SrcPitch, + const int srcOffsetX, + const int srcOffsetY, + __global uchar* Dst, + const int DstOffset, + const uint DstPitch, + int width, + int height, + int dstWidth, + int dstHeight + ) +{ + //RADIUSX, RADIUSY are filter dimensions + //BLK_X, BLK_Y are local wrogroup sizes + //all these should be defined on host during compile time + //first lsmem array for source pixels used in first pass, + //second lsmemDy for storing first pass results + __local WORKTYPE lsmem[BLK_Y+2*RADIUSY][BLK_X+2*RADIUSX]; + __local WORKTYPE lsmemDy[BLK_Y][BLK_X+2*RADIUSX]; + + //get local and global ids - used as image and local memory array indexes + int lix = get_local_id(0); + int liy = get_local_id(1); + + int x = (int)get_global_id(0); + int y = (int)get_global_id(1); + + //calculate pixel position in source image taking image offset into account + int srcX = x + srcOffsetX - RADIUSX; + int srcY = y + srcOffsetY - RADIUSY; + int xb = srcX; + int yb = srcY; + + //extrapolate coordinates, if needed + //and read my own source pixel into local memory + //with account for extra border pixels, which will be read by starting workitems + int clocY = liy; + int cSrcY = srcY; + do + { + int yb = cSrcY; + EXTRAPOLATE(yb, (height)); + + int clocX = lix; + int cSrcX = srcX; + do + { + int xb = cSrcX; + EXTRAPOLATE(xb,(width)); + lsmem[clocY][clocX] = ELEM(xb, yb, (width), (height), 0 ); + + clocX += BLK_X; + cSrcX += BLK_X; + } + while(clocX < BLK_X+(RADIUSX*2)); + + clocY += BLK_Y; + cSrcY += BLK_Y; + } + while(clocY < BLK_Y+(RADIUSY*2)); + barrier(CLK_LOCAL_MEM_FENCE); + + //do vertical filter pass + //and store intermediate results to second local memory array + int i; + WORKTYPE sum = 0.0f; + int clocX = lix; + do + { + sum = 0.0f; + for(i=0; i<=2*RADIUSY; i++) + sum = mad(lsmem[liy+i][clocX], as_float(mat_kernelY[i]), sum); + lsmemDy[liy][clocX] = sum; + clocX += BLK_X; + } + while(clocX < BLK_X+(RADIUSX*2)); + barrier(CLK_LOCAL_MEM_FENCE); + + //if this pixel happened to be out of image borders because of global size rounding, + //then just return + if( x >= dstWidth || y >=dstHeight ) return; + + //do second horizontal filter pass + //and calculate final result + sum = 0.0f; + for(i=0; i<=2*RADIUSX; i++) + sum = mad(lsmemDy[liy][lix+i], as_float(mat_kernelX[i]), sum); + + //store result into destination image + DST(x,y) = CONVERT_DSTTYPE(sum); +} diff --git a/modules/ocl/src/opencl/haarobjectdetect.cl b/modules/ocl/src/opencl/haarobjectdetect.cl index 980e85dd2..d6e5fb9ba 100644 --- a/modules/ocl/src/opencl/haarobjectdetect.cl +++ b/modules/ocl/src/opencl/haarobjectdetect.cl @@ -126,13 +126,11 @@ __kernel void gpuRunHaarClassifierCascadePacked( ) { -// this version used information provided for each workgroup -// no empty WG int gid = (int)get_group_id(0); int lid_x = (int)get_local_id(0); int lid_y = (int)get_local_id(1); int lid = lid_y*LSx+lid_x; - int4 WGInfo = pWGInfo[gid]; + int4 WGInfo = pWGInfo[WGSTART+gid]; int GroupX = (WGInfo.y >> 16)&0xFFFF; int GroupY = (WGInfo.y >> 0 )& 0xFFFF; int 
Width = (WGInfo.x >> 16)&0xFFFF; @@ -140,8 +138,8 @@ __kernel void gpuRunHaarClassifierCascadePacked( int ImgOffset = WGInfo.z; float ScaleFactor = as_float(WGInfo.w); -#define DATA_SIZE_X (LSx+WND_SIZE_X) -#define DATA_SIZE_Y (LSy+WND_SIZE_Y) +#define DATA_SIZE_X (PIXEL_STEP*LSx+WND_SIZE_X) +#define DATA_SIZE_Y (PIXEL_STEP*LSy+WND_SIZE_Y) #define DATA_SIZE (DATA_SIZE_X*DATA_SIZE_Y) local int SumL[DATA_SIZE]; @@ -165,9 +163,11 @@ __kernel void gpuRunHaarClassifierCascadePacked( int4 info1 = p; int4 info2 = pq; - { - int xl = lid_x; - int yl = lid_y; + // calc processed ROI coordinate in local mem + int xl = lid_x*PIXEL_STEP; + int yl = lid_y*PIXEL_STEP; + + {// calc variance_norm_factor for all stages int OffsetLocal = yl * DATA_SIZE_X + xl; int OffsetGlobal = (GroupY+yl)* pixelstep + (GroupX+xl); @@ -194,13 +194,13 @@ __kernel void gpuRunHaarClassifierCascadePacked( int result = (1.0f>0.0f); for(int stageloop = start_stage; (stageloop < end_stage) && result; stageloop++ ) - {// iterate until candidate is exist + {// iterate until candidate is valid float stage_sum = 0.0f; __global GpuHidHaarStageClassifier* stageinfo = (__global GpuHidHaarStageClassifier*) ((__global uchar*)stagecascadeptr+stageloop*sizeof(GpuHidHaarStageClassifier)); + int lcl_off = (yl*DATA_SIZE_X)+(xl); int stagecount = stageinfo->count; float stagethreshold = stageinfo->threshold; - int lcl_off = (lid_y*DATA_SIZE_X)+(lid_x); for(int nodeloop = 0; nodeloop < stagecount; nodecounter++,nodeloop++ ) { // simple macro to extract shorts from int @@ -212,7 +212,7 @@ __kernel void gpuRunHaarClassifierCascadePacked( int4 n1 = pN[1]; int4 n2 = pN[2]; float nodethreshold = as_float(n2.y) * variance_norm_factor; - // calc sum of intensity pixels according to node information + // calc sum of intensity pixels according to classifier node information float classsum = (SumL[M0(n0.x)+lcl_off] - SumL[M1(n0.x)+lcl_off] - SumL[M0(n0.y)+lcl_off] + SumL[M1(n0.y)+lcl_off]) * as_float(n1.z) + (SumL[M0(n0.z)+lcl_off] - SumL[M1(n0.z)+lcl_off] - SumL[M0(n0.w)+lcl_off] + SumL[M1(n0.w)+lcl_off]) * as_float(n1.w) + @@ -228,8 +228,8 @@ __kernel void gpuRunHaarClassifierCascadePacked( int index = 1+atomic_inc((volatile global int*)candidate); //get index to write global data with face info if(index threshold) { float maxVal = val; + maxVal = fmax(GET_SRC_32F(j - 1, i - 1), maxVal); + maxVal = fmax(GET_SRC_32F(j , i - 1), maxVal); + maxVal = fmax(GET_SRC_32F(j + 1, i - 1), maxVal); - maxVal = fmax(ELEM_INT2(eig, j - 1, i - 1), maxVal); - maxVal = fmax(ELEM_INT2(eig, j , i - 1), maxVal); - maxVal = fmax(ELEM_INT2(eig, j + 1, i - 1), maxVal); + maxVal = fmax(GET_SRC_32F(j - 1, i), maxVal); + maxVal = fmax(GET_SRC_32F(j + 1, i), maxVal); - maxVal = fmax(ELEM_INT2(eig, j - 1, i), maxVal); - maxVal = fmax(ELEM_INT2(eig, j + 1, i), maxVal); - - maxVal = fmax(ELEM_INT2(eig, j - 1, i + 1), maxVal); - maxVal = fmax(ELEM_INT2(eig, j , i + 1), maxVal); - maxVal = fmax(ELEM_INT2(eig, j + 1, i + 1), maxVal); + maxVal = fmax(GET_SRC_32F(j - 1, i + 1), maxVal); + maxVal = fmax(GET_SRC_32F(j , i + 1), maxVal); + maxVal = fmax(GET_SRC_32F(j + 1, i + 1), maxVal); if (val == maxVal) { const int ind = atomic_inc(g_counter); if (ind < max_count) - corners[ind] = (float2)(j, i); + {// pack and store eigenvalue and its coordinates + corners[ind].x = val; + corners[ind].y = as_float(j|(i<<16)); + } } } } } +#undef GET_SRC_32F + //bitonic sort __kernel void sortCorners_bitonicSort ( - image2d_t eig, __global float2 * corners, const int count, const int stage, @@ -140,8 
+136,8 @@ __kernel const float2 leftPt = corners[leftId]; const float2 rightPt = corners[rightId]; - const float leftVal = ELEM_FLT2(eig, leftPt); - const float rightVal = ELEM_FLT2(eig, rightPt); + const float leftVal = leftPt.x; + const float rightVal = rightPt.x; const bool compareResult = leftVal > rightVal; @@ -152,124 +148,22 @@ __kernel corners[rightId] = sortOrder ? greater : lesser; } -//selection sort for gfft -//kernel is ported from Bolt library: -//https://github.com/HSA-Libraries/Bolt/blob/master/include/bolt/cl/sort_kernels.cl -// Local sort will firstly sort elements of each workgroup using selection sort -// its performance is O(n) -__kernel - void sortCorners_selectionSortLocal - ( - image2d_t eig, - __global float2 * corners, - const int count, - __local float2 * scratch - ) +// this is simple short serial kernel that makes some short reduction and initialization work +// it makes HOST like work to avoid additional sync with HOST to do this short work +// data - input/output float2. +// input data are sevral (min,max) pairs +// output data is one reduced (min,max) pair +// g_counter - counter that have to be initialized by 0 for next findCorner call. +__kernel void arithm_op_minMax_final(__global float * data, int groupnum,__global int * g_counter) { - int i = get_local_id(0); // index in workgroup - int numOfGroups = get_num_groups(0); // index in workgroup - int groupID = get_group_id(0); - int wg = get_local_size(0); // workgroup size = block size - int n; // number of elements to be processed for this work group - - int offset = groupID * wg; - int same = 0; - corners += offset; - n = (groupID == (numOfGroups-1))? (count - wg*(numOfGroups-1)) : wg; - float2 pt1, pt2; - - pt1 = corners[min(i, n)]; - scratch[i] = pt1; - barrier(CLK_LOCAL_MEM_FENCE); - - if(i >= n) + g_counter[0] = 0; + float minVal = data[0]; + float maxVal = data[groupnum]; + for(int i=1;i val1) - pos++;//calculate the rank of this element in this work group - else - { - if(val1 > val2) - continue; - else - { - // val1 and val2 are same - same++; - } - } - } - for (int j=0; j< same; j++) - corners[pos + j] = pt1; -} -__kernel - void sortCorners_selectionSortFinal - ( - image2d_t eig, - __global float2 * corners, - const int count - ) -{ - const int i = get_local_id(0); // index in workgroup - const int numOfGroups = get_num_groups(0); // index in workgroup - const int groupID = get_group_id(0); - const int wg = get_local_size(0); // workgroup size = block size - int pos = 0, same = 0; - const int offset = get_group_id(0) * wg; - const int remainder = count - wg*(numOfGroups-1); - - if((offset + i ) >= count) - return; - float2 pt1, pt2; - pt1 = corners[groupID*wg + i]; - - float val1 = ELEM_FLT2(eig, pt1); - float val2; - - for(int j=0; j val2) - break; - else - { - //Increment only if the value is not the same. - if( val2 > val1 ) - pos++; - else - same++; - } - } - } - - for(int k=0; k val2) - break; - else - { - //Don't increment if the value is the same. 
- //Two elements are same if (*userComp)(jData, iData) and (*userComp)(iData, jData) are both false - if(val2 > val1) - pos++; - else - same++; - } - } - for (int j=0; j< same; j++) - corners[pos + j] = pt1; -} + data[0] = minVal; + data[1] = maxVal; +} \ No newline at end of file diff --git a/modules/ocl/src/opencl/imgproc_sobel3.cl b/modules/ocl/src/opencl/imgproc_sobel3.cl index d6a995f55..8356fce01 100644 --- a/modules/ocl/src/opencl/imgproc_sobel3.cl +++ b/modules/ocl/src/opencl/imgproc_sobel3.cl @@ -1,45 +1,97 @@ /////////////////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////Macro for border type//////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////// -#ifdef BORDER_REPLICATE -//BORDER_REPLICATE: aaaaaa|abcdefgh|hhhhhhh -#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? (l_edge) : (i)) -#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? (r_edge)-1 : (addr)) -#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? (t_edge) :(i)) -#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? (b_edge)-1 :(addr)) + +#ifdef BORDER_CONSTANT +//CCCCCC|abcdefgh|CCCCCCC +#define EXTRAPOLATE(x, maxV) +#elif defined BORDER_REPLICATE +//aaaaaa|abcdefgh|hhhhhhh +#define EXTRAPOLATE(x, maxV) \ + { \ + (x) = max(min((x), (maxV) - 1), 0); \ + } +#elif defined BORDER_WRAP +//cdefgh|abcdefgh|abcdefg +#define EXTRAPOLATE(x, maxV) \ + { \ + (x) = ( (x) + (maxV) ) % (maxV); \ + } +#elif defined BORDER_REFLECT +//fedcba|abcdefgh|hgfedcb +#define EXTRAPOLATE(x, maxV) \ + { \ + (x) = min( mad24((maxV)-1,2,-(x))+1 , max((x),-(x)-1) ); \ + } +#elif defined BORDER_REFLECT_101 +//gfedcb|abcdefgh|gfedcba +#define EXTRAPOLATE(x, maxV) \ + { \ + (x) = min( mad24((maxV)-1,2,-(x)), max((x),-(x)) ); \ + } +#else +#error No extrapolation method #endif -#ifdef BORDER_REFLECT -//BORDER_REFLECT: fedcba|abcdefgh|hgfedcb -#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? -(i)-1 : (i)) -#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? -(i)-1+((r_edge)<<1) : (addr)) -#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? -(i)-1 : (i)) -#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? -(i)-1+((b_edge)<<1) : (addr)) +#define SRC(_x,_y) convert_float(((global SRCTYPE*)(Src+(_y)*SrcPitch))[_x]) + +#ifdef BORDER_CONSTANT +//CCCCCC|abcdefgh|CCCCCCC +#define ELEM(_x,_y,r_edge,t_edge,const_v) (_x)<0 | (_x) >= (r_edge) | (_y)<0 | (_y) >= (t_edge) ? (const_v) : SRC((_x),(_y)) +#else +#define ELEM(_x,_y,r_edge,t_edge,const_v) SRC((_x),(_y)) #endif -#ifdef BORDER_REFLECT101 -//BORDER_REFLECT101: gfedcb|abcdefgh|gfedcba -#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? -(i) : (i)) -#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? -(i)-2+((r_edge)<<1) : (addr)) -#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? -(i) : (i)) -#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? -(i)-2+((b_edge)<<1) : (addr)) -#endif +#define DSTX(_x,_y) (((global float*)(DstX+DstXOffset+(_y)*DstXPitch))[_x]) +#define DSTY(_x,_y) (((global float*)(DstY+DstYOffset+(_y)*DstYPitch))[_x]) -#ifdef BORDER_WRAP -//BORDER_WRAP: cdefgh|abcdefgh|abcdefg -#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? (i)+(r_edge) : (i)) -#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? (i)-(r_edge) : (addr)) -#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? (i)+(b_edge) : (i)) -#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? 
(i)-(b_edge) : (addr)) -#endif +#define INIT_AND_READ_LOCAL_SOURCE(width, height, fill_const, kernel_border) \ + int srcX = x + srcOffsetX - (kernel_border); \ + int srcY = y + srcOffsetY - (kernel_border); \ + int xb = srcX; \ + int yb = srcY; \ + \ + EXTRAPOLATE(xb, (width)); \ + EXTRAPOLATE(yb, (height)); \ + lsmem[liy][lix] = ELEM(xb, yb, (width), (height), (fill_const) ); \ + \ + if(lix < ((kernel_border)*2)) \ + { \ + int xb = srcX+BLK_X; \ + EXTRAPOLATE(xb,(width)); \ + lsmem[liy][lix+BLK_X] = ELEM(xb, yb, (width), (height), (fill_const) ); \ + } \ + if(liy< ((kernel_border)*2)) \ + { \ + int yb = srcY+BLK_Y; \ + EXTRAPOLATE(yb, (height)); \ + lsmem[liy+BLK_Y][lix] = ELEM(xb, yb, (width), (height), (fill_const) ); \ + } \ + if(lix<((kernel_border)*2) && liy<((kernel_border)*2)) \ + { \ + int xb = srcX+BLK_X; \ + int yb = srcY+BLK_Y; \ + EXTRAPOLATE(xb,(width)); \ + EXTRAPOLATE(yb,(height)); \ + lsmem[liy+BLK_Y][lix+BLK_X] = ELEM(xb, yb, (width), (height), (fill_const) ); \ + } __kernel void sobel3( __global uchar* Src, - __global float* DstX, - __global float* DstY, - int width, int height, - uint srcStride, uint dstStride, - float scale + const uint SrcPitch, + const int srcOffsetX, + const int srcOffsetY, + __global uchar* DstX, + const int DstXOffset, + const uint DstXPitch, + __global uchar* DstY, + const int DstYOffset, + const uint DstYPitch, + int width, + int height, + int dstWidth, + int dstHeight, + float scale ) { __local float lsmem[BLK_Y+2][BLK_X+2]; @@ -47,62 +99,249 @@ __kernel void sobel3( int lix = get_local_id(0); int liy = get_local_id(1); - int gix = get_group_id(0); - int giy = get_group_id(1); - - int id_x = get_global_id(0); - int id_y = get_global_id(1); - - lsmem[liy+1][lix+1] = convert_float(Src[ id_y * srcStride + id_x ]); - - int id_y_h = ADDR_H(id_y-1, 0,height); - int id_y_b = ADDR_B(id_y+1, height,id_y+1); - - int id_x_l = ADDR_L(id_x-1, 0,width); - int id_x_r = ADDR_R(id_x+1, width,id_x+1); - - if(liy==0) - { - lsmem[0][lix+1]=convert_float(Src[ id_y_h * srcStride + id_x ]); - - if(lix==0) - lsmem[0][0]=convert_float(Src[ id_y_h * srcStride + id_x_l ]); - else if(lix==BLK_X-1) - lsmem[0][BLK_X+1]=convert_float(Src[ id_y_h * srcStride + id_x_r ]); - } - else if(liy==BLK_Y-1) - { - lsmem[BLK_Y+1][lix+1]=convert_float(Src[ id_y_b * srcStride + id_x ]); - - if(lix==0) - lsmem[BLK_Y+1][0]=convert_float(Src[ id_y_b * srcStride + id_x_l ]); - else if(lix==BLK_X-1) - lsmem[BLK_Y+1][BLK_X+1]=convert_float(Src[ id_y_b * srcStride + id_x_r ]); - } - - if(lix==0) - lsmem[liy+1][0] = convert_float(Src[ id_y * srcStride + id_x_l ]); - else if(lix==BLK_X-1) - lsmem[liy+1][BLK_X+1] = convert_float(Src[ id_y * srcStride + id_x_r ]); + int x = (int)get_global_id(0); + int y = (int)get_global_id(1); + INIT_AND_READ_LOCAL_SOURCE(width, height, 0, 1) barrier(CLK_LOCAL_MEM_FENCE); + if( x >= dstWidth || y >=dstHeight ) return; + float u1 = lsmem[liy][lix]; float u2 = lsmem[liy][lix+1]; float u3 = lsmem[liy][lix+2]; float m1 = lsmem[liy+1][lix]; - float m2 = lsmem[liy+1][lix+1]; float m3 = lsmem[liy+1][lix+2]; float b1 = lsmem[liy+2][lix]; float b2 = lsmem[liy+2][lix+1]; float b3 = lsmem[liy+2][lix+2]; - //m2 * scale;// - float dx = mad(2.0f, m3 - m1, u3 - u1 + b3 - b1 ); - DstX[ id_y * dstStride + id_x ] = dx * scale; + //calc and store dx and dy;// +#ifdef SCHARR + DSTX(x,y) = mad(10.0f, m3 - m1, 3.0f * (u3 - u1 + b3 - b1)) * scale; + DSTY(x,y) = mad(10.0f, b2 - u2, 3.0f * (b1 - u1 + b3 - u3)) * scale; +#else + DSTX(x,y) = mad(2.0f, m3 - m1, u3 - u1 + b3 - b1) * scale; + 
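// With the naming above (u = top row, m = middle row, b = bottom row, columns 1..3),
+    // the dx line evaluates the separable 3x3 Sobel kernel [1 2 1]^T * [-1 0 1], and the
+    // dy line below evaluates its transpose; the SCHARR branch keeps the same structure
+    // but uses the Scharr weights 3 and 10 in place of 1 and 2.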
DSTY(x,y) = mad(2.0f, b2 - u2, b1 - u1 + b3 - u3) * scale; +#endif +} - float dy = mad(2.0f, b2 - u2, b1 - u1 + b3 - u3); - DstY[ id_y * dstStride + id_x ] = dy * scale; -} \ No newline at end of file +__kernel void sobel5( + __global uchar* Src, + const uint SrcPitch, + const int srcOffsetX, + const int srcOffsetY, + __global uchar* DstX, + const int DstXOffset, + const uint DstXPitch, + __global uchar* DstY, + const int DstYOffset, + const uint DstYPitch, + int width, + int height, + int dstWidth, + int dstHeight, + float scale + ) +{ + __local float lsmem[BLK_Y+4][BLK_X+4]; + + int lix = get_local_id(0); + int liy = get_local_id(1); + + int x = (int)get_global_id(0); + int y = (int)get_global_id(1); + + INIT_AND_READ_LOCAL_SOURCE(width, height, 0, 2) + barrier(CLK_LOCAL_MEM_FENCE); + + if( x >= dstWidth || y >=dstHeight ) return; + + float t1 = lsmem[liy][lix]; + float t2 = lsmem[liy][lix+1]; + float t3 = lsmem[liy][lix+2]; + float t4 = lsmem[liy][lix+3]; + float t5 = lsmem[liy][lix+4]; + + float u1 = lsmem[liy+1][lix]; + float u2 = lsmem[liy+1][lix+1]; + float u3 = lsmem[liy+1][lix+2]; + float u4 = lsmem[liy+1][lix+3]; + float u5 = lsmem[liy+1][lix+4]; + + float m1 = lsmem[liy+2][lix]; + float m2 = lsmem[liy+2][lix+1]; + float m4 = lsmem[liy+2][lix+3]; + float m5 = lsmem[liy+2][lix+4]; + + float l1 = lsmem[liy+3][lix]; + float l2 = lsmem[liy+3][lix+1]; + float l3 = lsmem[liy+3][lix+2]; + float l4 = lsmem[liy+3][lix+3]; + float l5 = lsmem[liy+3][lix+4]; + + float b1 = lsmem[liy+4][lix]; + float b2 = lsmem[liy+4][lix+1]; + float b3 = lsmem[liy+4][lix+2]; + float b4 = lsmem[liy+4][lix+3]; + float b5 = lsmem[liy+4][lix+4]; + + //calc and store dx and dy;// + DSTX(x,y) = scale * + mad(12.0f, m4 - m2, + mad(6.0f, m5 - m1, + mad(8.0f, u4 - u2 + l4 - l2, + mad(4.0f, u5 - u1 + l5 - l1, + mad(2.0f, t4 - t2 + b4 - b2, t5 - t1 + b5 - b1 ) + ) + ) + ) + ); + + DSTY(x,y) = scale * + mad(12.0f, l3 - u3, + mad(6.0f, b3 - t3, + mad(8.0f, l2 - u2 + l4 - u4, + mad(4.0f, b2 - t2 + b4 - t4, + mad(2.0f, l1 - u1 + l5 - u5, b1 - t1 + b5 - t5 ) + ) + ) + ) + ); +} + +__kernel void sobel7( + __global uchar* Src, + const uint SrcPitch, + const int srcOffsetX, + const int srcOffsetY, + __global uchar* DstX, + const int DstXOffset, + const uint DstXPitch, + __global uchar* DstY, + const int DstYOffset, + const uint DstYPitch, + int width, + int height, + int dstWidth, + int dstHeight, + float scale + ) +{ + __local float lsmem[BLK_Y+6][BLK_X+6]; + + int lix = get_local_id(0); + int liy = get_local_id(1); + + int x = (int)get_global_id(0); + int y = (int)get_global_id(1); + + INIT_AND_READ_LOCAL_SOURCE(width, height, 0, 3) + barrier(CLK_LOCAL_MEM_FENCE); + + if( x >= dstWidth || y >=dstHeight ) return; + + float tt1 = lsmem[liy][lix]; + float tt2 = lsmem[liy][lix+1]; + float tt3 = lsmem[liy][lix+2]; + float tt4 = lsmem[liy][lix+3]; + float tt5 = lsmem[liy][lix+4]; + float tt6 = lsmem[liy][lix+5]; + float tt7 = lsmem[liy][lix+6]; + + float t1 = lsmem[liy+1][lix]; + float t2 = lsmem[liy+1][lix+1]; + float t3 = lsmem[liy+1][lix+2]; + float t4 = lsmem[liy+1][lix+3]; + float t5 = lsmem[liy+1][lix+4]; + float t6 = lsmem[liy+1][lix+5]; + float t7 = lsmem[liy+1][lix+6]; + + float u1 = lsmem[liy+2][lix]; + float u2 = lsmem[liy+2][lix+1]; + float u3 = lsmem[liy+2][lix+2]; + float u4 = lsmem[liy+2][lix+3]; + float u5 = lsmem[liy+2][lix+4]; + float u6 = lsmem[liy+2][lix+5]; + float u7 = lsmem[liy+2][lix+6]; + + float m1 = lsmem[liy+3][lix]; + float m2 = lsmem[liy+3][lix+1]; + float m3 = lsmem[liy+3][lix+2]; + float m5 = 
lsmem[liy+3][lix+4]; + float m6 = lsmem[liy+3][lix+5]; + float m7 = lsmem[liy+3][lix+6]; + + float l1 = lsmem[liy+4][lix]; + float l2 = lsmem[liy+4][lix+1]; + float l3 = lsmem[liy+4][lix+2]; + float l4 = lsmem[liy+4][lix+3]; + float l5 = lsmem[liy+4][lix+4]; + float l6 = lsmem[liy+4][lix+5]; + float l7 = lsmem[liy+4][lix+6]; + + float b1 = lsmem[liy+5][lix]; + float b2 = lsmem[liy+5][lix+1]; + float b3 = lsmem[liy+5][lix+2]; + float b4 = lsmem[liy+5][lix+3]; + float b5 = lsmem[liy+5][lix+4]; + float b6 = lsmem[liy+5][lix+5]; + float b7 = lsmem[liy+5][lix+6]; + + float bb1 = lsmem[liy+6][lix]; + float bb2 = lsmem[liy+6][lix+1]; + float bb3 = lsmem[liy+6][lix+2]; + float bb4 = lsmem[liy+6][lix+3]; + float bb5 = lsmem[liy+6][lix+4]; + float bb6 = lsmem[liy+6][lix+5]; + float bb7 = lsmem[liy+6][lix+6]; + + //calc and store dx and dy + DSTX(x,y) = scale * + mad(100.0f, m5 - m3, + mad(80.0f, m6 - m2, + mad(20.0f, m7 - m1, + mad(75.0f, u5 - u3 + l5 - l3, + mad(60.0f, u6 - u2 + l6 - l2, + mad(15.0f, u7 - u1 + l7 - l1, + mad(30.0f, t5 - t3 + b5 - b3, + mad(24.0f, t6 - t2 + b6 - b2, + mad(6.0f, t7 - t1 + b7 - b1, + mad(5.0f, tt5 - tt3 + bb5 - bb3, + mad(4.0f, tt6 - tt2 + bb6 - bb2, tt7 - tt1 + bb7 - bb1 ) + ) + ) + ) + ) + ) + ) + ) + ) + ) + ); + + DSTY(x,y) = scale * + mad(100.0f, l4 - u4, + mad(80.0f, b4 - t4, + mad(20.0f, bb4 - tt4, + mad(75.0f, l5 - u5 + l3 - u3, + mad(60.0f, b5 - t5 + b3 - t3, + mad(15.0f, bb5 - tt5 + bb3 - tt3, + mad(30.0f, l6 - u6 + l2 - u2, + mad(24.0f, b6 - t6 + b2 - t2, + mad(6.0f, bb6 - tt6 + bb2 - tt2, + mad(5.0f, l7 - u7 + l1 - u1, + mad(4.0f, b7 - t7 + b1 - t1, bb7 - tt7 + bb1 - tt1 ) + ) + ) + ) + ) + ) + ) + ) + ) + ) + ); +} diff --git a/modules/ocl/src/opencl/objdetect_hog.cl b/modules/ocl/src/opencl/objdetect_hog.cl index 0d2f26f96..60d7346e5 100644 --- a/modules/ocl/src/opencl/objdetect_hog.cl +++ b/modules/ocl/src/opencl/objdetect_hog.cl @@ -50,6 +50,14 @@ #define NTHREADS 256 #define CV_PI_F 3.1415926535897932384626433832795f +#ifdef INTEL_DEVICE +#define QANGLE_TYPE int +#define QANGLE_TYPE2 int2 +#else +#define QANGLE_TYPE uchar +#define QANGLE_TYPE2 uchar2 +#endif + //---------------------------------------------------------------------------- // Histogram computation // 12 threads for a cell, 12x4 threads per block @@ -59,7 +67,7 @@ __kernel void compute_hists_lut_kernel( const int cnbins, const int cblock_hist_size, const int img_block_width, const int blocks_in_group, const int blocks_total, const int grad_quadstep, const int qangle_step, - __global const float* grad, __global const uchar* qangle, + __global const float* grad, __global const QANGLE_TYPE* qangle, __global const float* gauss_w_lut, __global float* block_hists, __local float* smem) { @@ -86,7 +94,7 @@ __kernel void compute_hists_lut_kernel( __global const float* grad_ptr = (gid < blocks_total) ? grad + offset_y * grad_quadstep + (offset_x << 1) : grad; - __global const uchar* qangle_ptr = (gid < blocks_total) ? + __global const QANGLE_TYPE* qangle_ptr = (gid < blocks_total) ? 
qangle + offset_y * qangle_step + (offset_x << 1) : qangle; __local float* hist = hists + 12 * (cell_y * CELLS_PER_BLOCK_Y + cell_x) + @@ -101,7 +109,7 @@ __kernel void compute_hists_lut_kernel( for (int dist_y = dist_y_begin; dist_y < dist_y_begin + 12; ++dist_y) { float2 vote = (float2) (grad_ptr[0], grad_ptr[1]); - uchar2 bin = (uchar2) (qangle_ptr[0], qangle_ptr[1]); + QANGLE_TYPE2 bin = (QANGLE_TYPE2) (qangle_ptr[0], qangle_ptr[1]); grad_ptr += grad_quadstep; qangle_ptr += qangle_step; @@ -558,7 +566,7 @@ __kernel void extract_descrs_by_cols_kernel( __kernel void compute_gradients_8UC4_kernel( const int height, const int width, const int img_step, const int grad_quadstep, const int qangle_step, - const __global uchar4 * img, __global float * grad, __global uchar * qangle, + const __global uchar4 * img, __global float * grad, __global QANGLE_TYPE * qangle, const float angle_scale, const char correct_gamma, const int cnbins) { const int x = get_global_id(0); @@ -660,7 +668,7 @@ __kernel void compute_gradients_8UC4_kernel( __kernel void compute_gradients_8UC1_kernel( const int height, const int width, const int img_step, const int grad_quadstep, const int qangle_step, - __global const uchar * img, __global float * grad, __global uchar * qangle, + __global const uchar * img, __global float * grad, __global QANGLE_TYPE * qangle, const float angle_scale, const char correct_gamma, const int cnbins) { const int x = get_global_id(0); diff --git a/modules/ts/src/ts_func.cpp b/modules/ts/src/ts_func.cpp index 0472815bf..318f9e0a0 100644 --- a/modules/ts/src/ts_func.cpp +++ b/modules/ts/src/ts_func.cpp @@ -116,7 +116,7 @@ Mat randomMat(RNG& rng, Size size, int type, double minVal, double maxVal, bool Mat m(size0, type); - rng.fill(m, RNG::UNIFORM, Scalar::all(minVal), Scalar::all(maxVal)); + rng.fill(m, RNG::UNIFORM, minVal, maxVal); if( size0 == size ) return m; return m(Rect((size0.width-size.width)/2, (size0.height-size.height)/2, size.width, size.height)); @@ -142,7 +142,7 @@ Mat randomMat(RNG& rng, const vector& size, int type, double minVal, double Mat m(dims, &size0[0], type); - rng.fill(m, RNG::UNIFORM, Scalar::all(minVal), Scalar::all(maxVal)); + rng.fill(m, RNG::UNIFORM, minVal, maxVal); if( eqsize ) return m; return m(&r[0]); diff --git a/platforms/linux/arm-gnueabi.toolchain.cmake b/platforms/linux/arm-gnueabi.toolchain.cmake index c6b0469ad..2c5b7406d 100644 --- a/platforms/linux/arm-gnueabi.toolchain.cmake +++ b/platforms/linux/arm-gnueabi.toolchain.cmake @@ -28,14 +28,11 @@ set(CMAKE_MODULE_LINKER_FLAGS "-Wl,--fix-cortex-a8 -Wl,--no-undefined -Wl,--gc-s set(CMAKE_EXE_LINKER_FLAGS "-Wl,--fix-cortex-a8 -Wl,--no-undefined -Wl,--gc-sections -Wl,-z,noexecstack -Wl,-z,relro -Wl,-z,now ${CMAKE_EXE_LINKER_FLAGS}") if(USE_NEON) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfpu=neon") - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfpu=neon") + message(WARNING "You use obsolete variable USE_NEON to enable NEON instruction set. Use -DENABLE_NEON=ON instead." ) + set(ENABLE_NEON TRUE) elseif(USE_VFPV3) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfpu=vfpv3") - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfpu=vfpv3") -else() - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfpu=vfpv3-d16") - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfpu=vfpv3-d16") + message(WARNING "You use obsolete variable USE_VFPV3 to enable VFPV3 instruction set. Use -DENABLE_VFPV3=ON instead." 
)
+  set(ENABLE_VFPV3 TRUE)
 endif()

 set(CMAKE_FIND_ROOT_PATH ${CMAKE_FIND_ROOT_PATH} ${ARM_LINUX_SYSROOT})
diff --git a/samples/cpp/intelperc_capture.cpp b/samples/cpp/intelperc_capture.cpp
new file mode 100644
index 000000000..40349e0fb
--- /dev/null
+++ b/samples/cpp/intelperc_capture.cpp
@@ -0,0 +1,376 @@
+// intelperc_capture.cpp : Defines the entry point for the console application.
+//
+
+#include "opencv2/highgui/highgui.hpp"
+
+#include <iostream>
+
+using namespace cv;
+using namespace std;
+
+static bool g_printStreamSetting = false;
+static int g_imageStreamProfileIdx = -1;
+static int g_depthStreamProfileIdx = -1;
+static bool g_irStreamShow = false;
+static double g_imageBrightness = -DBL_MAX;
+static double g_imageContrast = -DBL_MAX;
+static bool g_printTiming = false;
+static bool g_showClosedPoint = false;
+
+
+static int g_closedDepthPoint[2];
+
+static void printUsage(const char *arg0)
+{
+    const char *filename = arg0;
+    while (*filename)
+        filename++;
+    while ((arg0 <= filename) && ('\\' != *filename) && ('/' != *filename))
+        filename--;
+    filename++;
+
+    cout << "This program demonstrates usage of a camera supported\nby the Intel Perceptual Computing SDK." << endl << endl;
+    cout << "usage: " << filename << " [-ps] [-isp IDX] [-dsp IDX]\n    [-ir] [-imb VAL] [-imc VAL] [-pts] [--show-closed]" << endl << endl;
+    cout << "   -ps,            print stream settings and profiles" << endl;
+    cout << "   -isp IDX,       set profile index of the image stream" << endl;
+    cout << "   -dsp IDX,       set profile index of the depth stream" << endl;
+    cout << "   -ir,            show data from the IR stream" << endl;
+    cout << "   -imb VAL,       set brightness value for the image stream" << endl;
+    cout << "   -imc VAL,       set contrast value for the image stream" << endl;
+    cout << "   -pts,           print frame index and frame time" << endl;
+    cout << "   --show-closed,  mark the depth point closest to the camera" << endl;
+    cout << endl;
+}
+
+static void parseCMDLine(int argc, char* argv[])
+{
+    if( argc == 1 )
+    {
+        printUsage(argv[0]);
+    }
+    else
+    {
+        for( int i = 1; i < argc; i++ )
+        {
+            if ((0 == strcmp(argv[i], "--help")) || (0 == strcmp( argv[i], "-h")))
+            {
+                printUsage(argv[0]);
+                exit(0);
+            }
+            else if ((0 == strcmp( argv[i], "--print-streams")) || (0 == strcmp( argv[i], "-ps")))
+            {
+                g_printStreamSetting = true;
+            }
+            else if ((0 == strcmp( argv[i], "--image-stream-prof")) || (0 == strcmp( argv[i], "-isp")))
+            {
+                g_imageStreamProfileIdx = atoi(argv[++i]);
+            }
+            else if ((0 == strcmp( argv[i], "--depth-stream-prof")) || (0 == strcmp( argv[i], "-dsp")))
+            {
+                g_depthStreamProfileIdx = atoi(argv[++i]);
+            }
+            else if (0 == strcmp( argv[i], "-ir"))
+            {
+                g_irStreamShow = true;
+            }
+            else if (0 == strcmp( argv[i], "-imb"))
+            {
+                g_imageBrightness = atof(argv[++i]);
+            }
+            else if (0 == strcmp( argv[i], "-imc"))
+            {
+                g_imageContrast = atof(argv[++i]);
+            }
+            else if (0 == strcmp(argv[i], "-pts"))
+            {
+                g_printTiming = true;
+            }
+            else if (0 == strcmp(argv[i], "--show-closed"))
+            {
+                g_showClosedPoint = true;
+            }
+            else
+            {
+                cout << "Unsupported command line argument: " << argv[i] << "." << endl;
+                exit(-1);
+            }
+        }
+        if (g_showClosedPoint && (-1 == g_depthStreamProfileIdx))
+        {
+            cerr << "For --show-closed a depth profile has to be selected" << endl;
+            exit(-1);
+        }
+    }
+}
+
+static void printStreamProperties(VideoCapture &capture)
+{
+    size_t profilesCount = (size_t)capture.get(CAP_INTELPERC_IMAGE_GENERATOR | CAP_PROP_INTELPERC_PROFILE_COUNT);
+    cout << "Image stream."
<< endl; + cout << " Brightness = " << capture.get(CAP_INTELPERC_IMAGE_GENERATOR | CAP_PROP_BRIGHTNESS) << endl; + cout << " Contrast = " << capture.get(CAP_INTELPERC_IMAGE_GENERATOR | CAP_PROP_CONTRAST) << endl; + cout << " Saturation = " << capture.get(CAP_INTELPERC_IMAGE_GENERATOR | CAP_PROP_SATURATION) << endl; + cout << " Hue = " << capture.get(CAP_INTELPERC_IMAGE_GENERATOR | CAP_PROP_HUE) << endl; + cout << " Gamma = " << capture.get(CAP_INTELPERC_IMAGE_GENERATOR | CAP_PROP_GAMMA) << endl; + cout << " Sharpness = " << capture.get(CAP_INTELPERC_IMAGE_GENERATOR | CAP_PROP_SHARPNESS) << endl; + cout << " Gain = " << capture.get(CAP_INTELPERC_IMAGE_GENERATOR | CAP_PROP_GAIN) << endl; + cout << " Backligh = " << capture.get(CAP_INTELPERC_IMAGE_GENERATOR | CAP_PROP_BACKLIGHT) << endl; + cout << "Image streams profiles:" << endl; + for (size_t i = 0; i < profilesCount; i++) + { + capture.set(CAP_INTELPERC_IMAGE_GENERATOR | CAP_PROP_INTELPERC_PROFILE_IDX, (double)i); + cout << " Profile[" << i << "]: "; + cout << "width = " << + (int)capture.get(CAP_INTELPERC_IMAGE_GENERATOR | CAP_PROP_FRAME_WIDTH); + cout << ", height = " << + (int)capture.get(CAP_INTELPERC_IMAGE_GENERATOR | CAP_PROP_FRAME_HEIGHT); + cout << ", fps = " << + capture.get(CAP_INTELPERC_IMAGE_GENERATOR | CAP_PROP_FPS); + cout << endl; + } + + profilesCount = (size_t)capture.get(CAP_INTELPERC_DEPTH_GENERATOR | CAP_PROP_INTELPERC_PROFILE_COUNT); + cout << "Depth stream." << endl; + cout << " Low confidence value = " << capture.get(CAP_INTELPERC_DEPTH_GENERATOR | CAP_PROP_INTELPERC_DEPTH_LOW_CONFIDENCE_VALUE) << endl; + cout << " Saturation value = " << capture.get(CAP_INTELPERC_DEPTH_GENERATOR | CAP_PROP_INTELPERC_DEPTH_SATURATION_VALUE) << endl; + cout << " Confidence threshold = " << capture.get(CAP_INTELPERC_DEPTH_GENERATOR | CAP_PROP_INTELPERC_DEPTH_CONFIDENCE_THRESHOLD) << endl; + cout << " Focal length = (" << capture.get(CAP_INTELPERC_DEPTH_GENERATOR | CAP_PROP_INTELPERC_DEPTH_FOCAL_LENGTH_HORZ) << ", " + << capture.get(CAP_INTELPERC_DEPTH_GENERATOR | CAP_PROP_INTELPERC_DEPTH_FOCAL_LENGTH_VERT) << ")" << endl; + cout << "Depth streams profiles:" << endl; + for (size_t i = 0; i < profilesCount; i++) + { + capture.set(CAP_INTELPERC_DEPTH_GENERATOR | CAP_PROP_INTELPERC_PROFILE_IDX, (double)i); + cout << " Profile[" << i << "]: "; + cout << "width = " << + (int)capture.get(CAP_INTELPERC_DEPTH_GENERATOR | CAP_PROP_FRAME_WIDTH); + cout << ", height = " << + (int)capture.get(CAP_INTELPERC_DEPTH_GENERATOR | CAP_PROP_FRAME_HEIGHT); + cout << ", fps = " << + capture.get(CAP_INTELPERC_DEPTH_GENERATOR | CAP_PROP_FPS); + cout << endl; + } +} + +static void imshowImage(const char *winname, Mat &image, VideoCapture &capture) +{ + if (g_showClosedPoint) + { + Mat uvMap; + if (capture.retrieve(uvMap, CAP_INTELPERC_UVDEPTH_MAP)) + { + float *uvmap = (float *)uvMap.ptr() + 2 * (g_closedDepthPoint[0] * uvMap.cols + g_closedDepthPoint[1]); + int x = (int)((*uvmap) * image.cols); uvmap++; + int y = (int)((*uvmap) * image.rows); + + if ((0 <= x) && (0 <= y)) + { + static const int pointSize = 4; + for (int row = y; row < min(y + pointSize, image.rows); row++) + { + uchar* ptrDst = image.ptr(row) + x * 3 + 2;//+2 -> Red + for (int col = 0; col < min(pointSize, image.cols - x); col++, ptrDst+=3) + { + *ptrDst = 255; + } + } + } + } + } + imshow(winname, image); +} +static void imshowIR(const char *winname, Mat &ir) +{ + Mat image; + if (g_showClosedPoint) + { + image.create(ir.rows, ir.cols, CV_8UC3); + for (int row = 0; row < ir.rows; row++) + { 
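+            // Expand each 16-bit IR sample into an 8-bit grey value (>> 2, assuming roughly
+            // 10 bits of useful range) and replicate it into all three channels, so that the
+            // closest-point marker can be drawn in red on top of it below.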
+ uchar* ptrDst = image.ptr(row); + short* ptrSrc = (short*)ir.ptr(row); + for (int col = 0; col < ir.cols; col++, ptrSrc++) + { + uchar val = (uchar) ((*ptrSrc) >> 2); + *ptrDst = val; ptrDst++; + *ptrDst = val; ptrDst++; + *ptrDst = val; ptrDst++; + } + } + + static const int pointSize = 4; + for (int row = g_closedDepthPoint[0]; row < min(g_closedDepthPoint[0] + pointSize, image.rows); row++) + { + uchar* ptrDst = image.ptr(row) + g_closedDepthPoint[1] * 3 + 2;//+2 -> Red + for (int col = 0; col < min(pointSize, image.cols - g_closedDepthPoint[1]); col++, ptrDst+=3) + { + *ptrDst = 255; + } + } + } + else + { + image.create(ir.rows, ir.cols, CV_8UC1); + for (int row = 0; row < ir.rows; row++) + { + uchar* ptrDst = image.ptr(row); + short* ptrSrc = (short*)ir.ptr(row); + for (int col = 0; col < ir.cols; col++, ptrSrc++, ptrDst++) + { + *ptrDst = (uchar) ((*ptrSrc) >> 2); + } + } + } + + imshow(winname, image); +} +static void imshowDepth(const char *winname, Mat &depth, VideoCapture &capture) +{ + short lowValue = (short)capture.get(CAP_INTELPERC_DEPTH_GENERATOR | CAP_PROP_INTELPERC_DEPTH_LOW_CONFIDENCE_VALUE); + short saturationValue = (short)capture.get(CAP_INTELPERC_DEPTH_GENERATOR | CAP_PROP_INTELPERC_DEPTH_SATURATION_VALUE); + + Mat image; + if (g_showClosedPoint) + { + image.create(depth.rows, depth.cols, CV_8UC3); + for (int row = 0; row < depth.rows; row++) + { + uchar* ptrDst = image.ptr(row); + short* ptrSrc = (short*)depth.ptr(row); + for (int col = 0; col < depth.cols; col++, ptrSrc++) + { + if ((lowValue == (*ptrSrc)) || (saturationValue == (*ptrSrc))) + { + *ptrDst = 0; ptrDst++; + *ptrDst = 0; ptrDst++; + *ptrDst = 0; ptrDst++; + } + else + { + uchar val = (uchar) ((*ptrSrc) >> 2); + *ptrDst = val; ptrDst++; + *ptrDst = val; ptrDst++; + *ptrDst = val; ptrDst++; + } + } + } + + static const int pointSize = 4; + for (int row = g_closedDepthPoint[0]; row < min(g_closedDepthPoint[0] + pointSize, image.rows); row++) + { + uchar* ptrDst = image.ptr(row) + g_closedDepthPoint[1] * 3 + 2;//+2 -> Red + for (int col = 0; col < min(pointSize, image.cols - g_closedDepthPoint[1]); col++, ptrDst+=3) + { + *ptrDst = 255; + } + } + } + else + { + image.create(depth.rows, depth.cols, CV_8UC1); + for (int row = 0; row < depth.rows; row++) + { + uchar* ptrDst = image.ptr(row); + short* ptrSrc = (short*)depth.ptr(row); + for (int col = 0; col < depth.cols; col++, ptrSrc++, ptrDst++) + { + if ((lowValue == (*ptrSrc)) || (saturationValue == (*ptrSrc))) + *ptrDst = 0; + else + *ptrDst = (uchar) ((*ptrSrc) >> 2); + } + } + } + imshow(winname, image); +} + +int main(int argc, char* argv[]) +{ + parseCMDLine(argc, argv); + + VideoCapture capture; + capture.open(CAP_INTELPERC); + if (!capture.isOpened()) + { + cerr << "Can not open a capture object." << endl; + return -1; + } + + if (g_printStreamSetting) + printStreamProperties(capture); + + if (-1 != g_imageStreamProfileIdx) + { + if (!capture.set(CAP_INTELPERC_IMAGE_GENERATOR | CAP_PROP_INTELPERC_PROFILE_IDX, (double)g_imageStreamProfileIdx)) + { + cerr << "Can not setup a image stream." << endl; + return -1; + } + } + if (-1 != g_depthStreamProfileIdx) + { + if (!capture.set(CAP_INTELPERC_DEPTH_GENERATOR | CAP_PROP_INTELPERC_PROFILE_IDX, (double)g_depthStreamProfileIdx)) + { + cerr << "Can not setup a depth stream." << endl; + return -1; + } + } + else if (g_irStreamShow) + { + if (!capture.set(CAP_INTELPERC_DEPTH_GENERATOR | CAP_PROP_INTELPERC_PROFILE_IDX, 0.0)) + { + cerr << "Can not setup a IR stream." 
<< endl;
+            return -1;
+        }
+    }
+    else
+    {
+        cout << "Streams not selected" << endl;
+        return 0;
+    }
+
+    //Set up additional properties only after the stream profile has been selected
+    if ( (-10000.0 < g_imageBrightness) && (g_imageBrightness < 10000.0))
+        capture.set(CAP_INTELPERC_IMAGE_GENERATOR | CAP_PROP_BRIGHTNESS, g_imageBrightness);
+    if ( (0 < g_imageContrast) && (g_imageContrast < 10000.0))
+        capture.set(CAP_INTELPERC_IMAGE_GENERATOR | CAP_PROP_CONTRAST, g_imageContrast);
+
+    int frame = 0;
+    for(;;frame++)
+    {
+        Mat bgrImage;
+        Mat depthImage;
+        Mat irImage;
+
+        if (!capture.grab())
+        {
+            cout << "Cannot grab images." << endl;
+            return -1;
+        }
+
+        if ((-1 != g_depthStreamProfileIdx) && (capture.retrieve(depthImage, CAP_INTELPERC_DEPTH_MAP)))
+        {
+            if (g_showClosedPoint)
+            {
+                double minVal = 0.0; double maxVal = 0.0;
+                minMaxIdx(depthImage, &minVal, &maxVal, g_closedDepthPoint);
+            }
+            imshowDepth("depth image", depthImage, capture);
+        }
+        if ((g_irStreamShow) && (capture.retrieve(irImage, CAP_INTELPERC_IR_MAP)))
+            imshowIR("ir image", irImage);
+        if ((-1 != g_imageStreamProfileIdx) && (capture.retrieve(bgrImage, CAP_INTELPERC_IMAGE)))
+            imshowImage("color image", bgrImage, capture);
+
+        if (g_printTiming)
+        {
+            cout << "Image frame: " << capture.get(CAP_INTELPERC_IMAGE_GENERATOR | CAP_PROP_POS_FRAMES)
+                 << ", Depth(IR) frame: " << capture.get(CAP_INTELPERC_DEPTH_GENERATOR | CAP_PROP_POS_FRAMES) << endl;
+            cout << "Image time: " << capture.get(CAP_INTELPERC_IMAGE_GENERATOR | CAP_PROP_POS_MSEC)
+                 << ", Depth(IR) time: " << capture.get(CAP_INTELPERC_DEPTH_GENERATOR | CAP_PROP_POS_MSEC) << endl;
+        }
+        if( waitKey(30) >= 0 )
+            break;
+    }
+
+    return 0;
+}
diff --git a/samples/cpp/tutorial_code/ml/introduction_to_svm/introduction_to_svm.cpp b/samples/cpp/tutorial_code/ml/introduction_to_svm/introduction_to_svm.cpp
index 480229b53..1c8dbd24a 100644
--- a/samples/cpp/tutorial_code/ml/introduction_to_svm/introduction_to_svm.cpp
+++ b/samples/cpp/tutorial_code/ml/introduction_to_svm/introduction_to_svm.cpp
@@ -32,13 +32,13 @@ int main()
     for (int i = 0; i < image.rows; ++i)
         for (int j = 0; j < image.cols; ++j)
         {
-            Mat sampleMat = (Mat_<float>(1,2) << i,j);
+            Mat sampleMat = (Mat_<float>(1,2) << j,i);
             float response = SVM.predict(sampleMat);

             if (response == 1)
-                image.at<Vec3b>(j, i) = green;
+                image.at<Vec3b>(i,j) = green;
             else if (response == -1)
-                image.at<Vec3b>(j, i) = blue;
+                image.at<Vec3b>(i,j) = blue;
         }

     // Show the training data
diff --git a/samples/ocl/facedetect.cpp b/samples/ocl/facedetect.cpp
index fd570b515..c5059323e 100644
--- a/samples/ocl/facedetect.cpp
+++ b/samples/ocl/facedetect.cpp
@@ -14,7 +14,10 @@
 using namespace std;
 using namespace cv;
+
 #define LOOP_NUM 1
+#define MAX_THREADS 10
+

 ///////////////////////////single-threading faces detecting///////////////////////////////
@@ -29,23 +32,23 @@ const static Scalar colors[] = { CV_RGB(0,0,255),
 } ;

-int64 work_begin = 0;
-int64 work_end = 0;
+int64 work_begin[MAX_THREADS] = {0};
+int64 work_total[MAX_THREADS] = {0};
 string inputName, outputName, cascadeName;

-static void workBegin()
+static void workBegin(int i = 0)
 {
-    work_begin = getTickCount();
+    work_begin[i] = getTickCount();
 }
-static void workEnd()
+static void workEnd(int i = 0)
 {
-    work_end += (getTickCount() - work_begin);
+    work_total[i] += (getTickCount() - work_begin[i]);
 }
-static double getTime()
+static double getTotalTime(int i = 0)
 {
-    return work_end /((double)cvGetTickFrequency() * 1000.);
+    return work_total[i] /getTickFrequency() * 1000.;
 }

@@ -98,7 +101,6 @@ static int facedetect_one_thread(bool
useCPU, double scale ) } } - cvNamedWindow( "result", 1 ); if( capture ) { cout << "In capture ..." << endl; @@ -118,7 +120,6 @@ static int facedetect_one_thread(bool useCPU, double scale ) else resize(frameCopy0, frameCopy, Size(), 1./scale, 1./scale, INTER_LINEAR); - work_end = 0; if(useCPU) detectCPU(frameCopy, faces, cpu_cascade, 1); else @@ -132,16 +133,16 @@ static int facedetect_one_thread(bool useCPU, double scale ) } else { - cout << "In image read" << endl; + cout << "In image read " << image.size() << endl; vector faces; vector ref_rst; double accuracy = 0.; detectCPU(image, ref_rst, cpu_cascade, scale); - work_end = 0; + cout << "loops: "; for(int i = 0; i <= LOOP_NUM; i ++) { - cout << "loop" << i << endl; + cout << i << ", "; if(useCPU) detectCPU(image, faces, cpu_cascade, scale); else @@ -152,16 +153,15 @@ static int facedetect_one_thread(bool useCPU, double scale ) accuracy = checkRectSimilarity(image.size(), ref_rst, faces); } } - if (i == LOOP_NUM) - { - if (useCPU) - cout << "average CPU time (noCamera) : "; - else - cout << "average GPU time (noCamera) : "; - cout << getTime() / LOOP_NUM << " ms" << endl; - cout << "accuracy value: " << accuracy <= 1700) -#define MAX_THREADS 10 - -static void detectFaces(std::string fileName) +static void detectFaces(std::string fileName, int threadNum) { ocl::OclCascadeClassifier cascade; if(!cascade.load(cascadeName)) @@ -188,7 +186,7 @@ static void detectFaces(std::string fileName) Mat img = imread(fileName, CV_LOAD_IMAGE_COLOR); if (img.empty()) { - std::cout << "cann't open file " + fileName < oclfaces; - cascade.detectMultiScale(d_img, oclfaces, 1.1, 3, 0 | CASCADE_SCALE_IMAGE, Size(30, 30), Size(0, 0)); + std::thread::id tid = std::this_thread::get_id(); + std::cout << '[' << threadNum << "] " + << "ThreadID = " << tid + << ", CommandQueue = " << *(void**)ocl::getClCommandQueuePtr() + << endl; + for(int i = 0; i <= LOOP_NUM; i++) + { + if(i>0) workBegin(threadNum); + cascade.detectMultiScale(d_img, oclfaces, 1.1, 3, 0|CASCADE_SCALE_IMAGE, Size(30, 30), Size(0, 0)); + if(i>0) workEnd(threadNum); + } + std::cout << '[' << threadNum << "] " << "Average time = " << getTotalTime(threadNum) / LOOP_NUM << " ms" << endl; for(unsigned int i = 0; i threads; for(int i = 0; i= 1 }"; CommandLineParser cmd(argc, argv, keys); @@ -312,8 +323,6 @@ void detectCPU( Mat& img, vector& faces, void Draw(Mat& img, vector& faces, double scale) { int i = 0; - putText(img, format("fps: %.1f", 1000./getTime()), Point(450, 50), - FONT_HERSHEY_SIMPLEX, 1, Scalar(0,255,0), 3); for( vector::const_iterator r = faces.begin(); r != faces.end(); r++, i++ ) { Point center; @@ -324,8 +333,8 @@ void Draw(Mat& img, vector& faces, double scale) radius = cvRound((r->width + r->height)*0.25*scale); circle( img, center, radius, color, 3, 8, 0 ); } - //imwrite( outputName, img ); - if(abs(scale-1.0)>.001) + //if( !outputName.empty() ) imwrite( outputName, img ); + if( abs(scale-1.0)>.001 ) { resize(img, img, Size((int)(img.cols/scale), (int)(img.rows/scale))); }
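
The per-thread timing helpers and the std::thread launch pattern introduced in facedetect.cpp above can be summarised with a minimal, self-contained sketch. It reuses the names from the patch (MAX_THREADS, LOOP_NUM, workBegin, workEnd, getTotalTime), but dummyDetect and its busy loop are placeholders standing in for OclCascadeClassifier::detectMultiScale, so this only illustrates the threading and timing structure, not the detection itself; as in the patch, it assumes a C++11 tool chain for std::thread.

#include "opencv2/core/core.hpp"   // cv::getTickCount, cv::getTickFrequency

#include <iostream>
#include <thread>
#include <vector>

#define LOOP_NUM    1
#define MAX_THREADS 10

static long long work_begin[MAX_THREADS] = {0};   // long long stands in for OpenCV's int64
static long long work_total[MAX_THREADS] = {0};

static void workBegin(int i = 0) { work_begin[i] = cv::getTickCount(); }
static void workEnd(int i = 0)   { work_total[i] += cv::getTickCount() - work_begin[i]; }
static double getTotalTime(int i = 0)
{
    return work_total[i] / cv::getTickFrequency() * 1000.;   // elapsed time in milliseconds
}

// Placeholder for the OpenCL cascade call made by the real sample.
static void dummyDetect(int threadNum)
{
    for (int i = 0; i <= LOOP_NUM; i++)
    {
        if (i > 0) workBegin(threadNum);   // skip the first (warm-up) iteration, as the sample does
        volatile double sink = 0.;
        for (int k = 0; k < 1000000; k++)
            sink += k * 1e-9;              // stand-in workload
        if (i > 0) workEnd(threadNum);
    }
    std::cout << '[' << threadNum << "] average time = "
              << getTotalTime(threadNum) / LOOP_NUM << " ms" << std::endl;
}

int main()
{
    std::vector<std::thread> threads;
    for (int i = 0; i < MAX_THREADS; i++)
        threads.push_back(std::thread(dummyDetect, i));
    for (size_t i = 0; i < threads.size(); i++)
        threads[i].join();
    return 0;
}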