Merge pull request #2045 from SpecLad:merge-2.4

2013-12-25 16:18:00 +04:00 · 2013-12-25 16:18:00 +04:00 · faddd5b97f
commit faddd5b97f
parent 0966e5ffa1 3c4b24d531
41 changed files with 3627 additions and 952 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -156,6 +156,7 @@ OCV_OPTION(WITH_OPENCL         "Include OpenCL Runtime support"              ON
 OCV_OPTION(WITH_OPENCLAMDFFT   "Include AMD OpenCL FFT library support"      ON   IF (NOT ANDROID AND NOT IOS) )
 OCV_OPTION(WITH_OPENCLAMDBLAS  "Include AMD OpenCL BLAS library support"     ON   IF (NOT ANDROID AND NOT IOS) )
 OCV_OPTION(WITH_DIRECTX        "Include DirectX support"                     ON   IF WIN32 )
 OCV_OPTION(WITH_INTELPERC      "Include Intel Perceptual Computing support"  OFF  IF WIN32 )
 # OpenCV build components
@ -207,7 +208,8 @@ OCV_OPTION(ENABLE_SSSE3               "Enable SSSE3 instructions"
 OCV_OPTION(ENABLE_SSE41               "Enable SSE4.1 instructions"                               OFF  IF ((CV_ICC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) )
 OCV_OPTION(ENABLE_SSE42               "Enable SSE4.2 instructions"                               OFF  IF (CMAKE_COMPILER_IS_GNUCXX AND (X86 OR X86_64)) )
 OCV_OPTION(ENABLE_AVX                 "Enable AVX instructions"                                  OFF  IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) )
-OCV_OPTION(ENABLE_NEON                "Enable NEON instructions"                                 OFF  IF (CMAKE_COMPILER_IS_GNUCXX AND ARM) )
+OCV_OPTION(ENABLE_NEON                "Enable NEON instructions"                                 OFF  IF CMAKE_COMPILER_IS_GNUCXX AND ARM )
 OCV_OPTION(ENABLE_VFPV3               "Enable VFPv3-D32 instructions"                            OFF  IF CMAKE_COMPILER_IS_GNUCXX AND ARM )
 OCV_OPTION(ENABLE_NOISY_WARNINGS      "Show all warnings even if they are too noisy"             OFF )
 OCV_OPTION(OPENCV_WARNINGS_ARE_ERRORS "Treat warnings as errors"                                 OFF )
 OCV_OPTION(ENABLE_WINRT_MODE          "Build with Windows Runtime support"                       OFF  IF WIN32 )
@ -226,6 +228,15 @@ include(cmake/OpenCVVersion.cmake)
 # Save libs and executables in the same place
 set(EXECUTABLE_OUTPUT_PATH "${CMAKE_BINARY_DIR}/bin" CACHE PATH "Output directory for applications" )
 if (ANDROID)
  if (ANDROID_ABI MATCHES "NEON")
    set(ENABLE_NEON ON)
  endif()
  if (ANDROID_ABI MATCHES "VFPV3")
    set(ENABLE_VFPV3 ON)
  endif()
 endif()
 if(ANDROID OR WIN32)
  set(OPENCV_DOC_INSTALL_PATH doc)
 elseif(INSTALL_TO_MANGLED_PATHS)
@ -820,6 +831,11 @@ if(DEFINED WITH_XINE)
  status("    Xine:"           HAVE_XINE           THEN "YES (ver ${ALIASOF_libxine_VERSION})"     ELSE NO)
 endif(DEFINED WITH_XINE)
 if(DEFINED WITH_INTELPERC)
  status("    Intel PerC:"     HAVE_INTELPERC      THEN "YES"                                 ELSE NO)
 endif(DEFINED WITH_INTELPERC)
 # ========================== Other third-party libraries ==========================
 status("")
 status("  Other third-party libraries:")
--- a/cmake/OpenCVCompilerOptions.cmake
+++ b/cmake/OpenCVCompilerOptions.cmake
@ -124,6 +124,12 @@ if(CMAKE_COMPILER_IS_GNUCXX)
  if(ENABLE_SSE2)
    add_extra_compiler_option(-msse2)
  endif()
  if (ENABLE_NEON)
    add_extra_compiler_option("-mfpu=neon")
  endif()
  if (ENABLE_VFPV3 AND NOT ENABLE_NEON)
    add_extra_compiler_option("-mfpu=vfpv3")
  endif()
  # SSE3 and further should be disabled under MingW because it generates compiler errors
  if(NOT MINGW)
--- a/cmake/OpenCVFindIntelPerCSDK.cmake
+++ b/cmake/OpenCVFindIntelPerCSDK.cmake
@ -0,0 +1,20 @@
 # Main variables:
 # INTELPERC_LIBRARIES and INTELPERC_INCLUDE to link Intel Perceptial Computing SDK modules
 # HAVE_INTELPERC for conditional compilation OpenCV with/without Intel Perceptial Computing SDK
 if(X86_64)
    find_path(INTELPERC_INCLUDE_DIR "pxcsession.h" PATHS "$ENV{PCSDK_DIR}include" DOC "Path to Intel Perceptual Computing SDK interface headers")
    find_file(INTELPERC_LIBRARIES "libpxc.lib" PATHS "$ENV{PCSDK_DIR}lib/x64" DOC "Path to Intel Perceptual Computing SDK interface libraries")
 else()
    find_path(INTELPERC_INCLUDE_DIR "pxcsession.h" PATHS "$ENV{PCSDK_DIR}include" DOC "Path to Intel Perceptual Computing SDK interface headers")
    find_file(INTELPERC_LIBRARIES "libpxc.lib" PATHS "$ENV{PCSDK_DIR}lib/Win32" DOC "Path to Intel Perceptual Computing SDK interface libraries")
 endif()
 if(INTELPERC_INCLUDE_DIR AND INTELPERC_LIBRARIES)
    set(HAVE_INTELPERC TRUE)
 else()
    set(HAVE_INTELPERC FALSE)
    message(WARNING "Intel Perceptual Computing SDK library directory (set by INTELPERC_LIB_DIR variable) is not found or does not have Intel Perceptual Computing SDK libraries.")
 endif() #if(INTELPERC_INCLUDE_DIR AND INTELPERC_LIBRARIES)
 mark_as_advanced(FORCE INTELPERC_LIBRARIES INTELPERC_INCLUDE_DIR)
--- a/cmake/OpenCVFindLibsVideo.cmake
+++ b/cmake/OpenCVFindLibsVideo.cmake
@ -277,3 +277,8 @@ if (NOT IOS)
    set(HAVE_QTKIT YES)
  endif()
 endif()
 # --- Intel Perceptual Computing SDK ---
 if(WITH_INTELPERC)
  include("${OpenCV_SOURCE_DIR}/cmake/OpenCVFindIntelPerCSDK.cmake")
 endif(WITH_INTELPERC)
--- a/cmake/templates/cvconfig.h.in
+++ b/cmake/templates/cvconfig.h.in
@ -88,6 +88,9 @@
 /* Define to 1 if you have the <inttypes.h> header file. */
 #cmakedefine HAVE_INTTYPES_H 1
 /* Intel Perceptual Computing SDK library */
 #cmakedefine HAVE_INTELPERC
 /* Intel Integrated Performance Primitives */
 #cmakedefine HAVE_IPP
--- a/doc/tutorials/introduction/crosscompilation/arm_crosscompile_with_cmake.rst
+++ b/doc/tutorials/introduction/crosscompilation/arm_crosscompile_with_cmake.rst
@ -106,8 +106,8 @@ Enable hardware optimizations
 -----------------------------
 Depending on target platform architecture different instruction sets can be used. By default
-compiler generates code for armv5l without VFPv3 and NEON extensions. Add ``-DUSE_VFPV3=ON``
+compiler generates code for armv5l without VFPv3 and NEON extensions. Add ``-DENABLE_VFPV3=ON``
-to cmake command line to enable code generation for VFPv3 and ``-DUSE_NEON=ON`` for using
+to cmake command line to enable code generation for VFPv3 and ``-DENABLE_NEON=ON`` for using
 NEON SIMD extensions.
 TBB is supported on multi core ARM SoCs also.
--- a/doc/user_guide/ug_intelperc.rst
+++ b/doc/user_guide/ug_intelperc.rst
@ -0,0 +1,79 @@
 *******
 HighGUI
 *******
 .. highlight:: cpp
 Using Creative Senz3D and other Intel Perceptual Computing SDK compatible depth sensors
 =======================================================================================
 Depth sensors compatible with Intel Perceptual Computing SDK are supported through ``VideoCapture`` class. Depth map, RGB image and some other formats of output can be retrieved by using familiar interface of ``VideoCapture``.
 In order to use depth sensor with OpenCV you should do the following preliminary steps:
 #.
    Install Intel Perceptual Computing SDK (from here http://www.intel.com/software/perceptual).
 #.
    Configure OpenCV with Intel Perceptual Computing SDK support by setting ``WITH_INTELPERC`` flag in CMake. If Intel Perceptual Computing SDK is found in install folders OpenCV will be built with Intel Perceptual Computing SDK library (see a status ``INTELPERC`` in CMake log). If CMake process doesn't find Intel Perceptual Computing SDK installation folder automatically, the user should change corresponding CMake variables ``INTELPERC_LIB_DIR`` and ``INTELPERC_INCLUDE_DIR`` to the proper value.
 #.
    Build OpenCV.
 VideoCapture can retrieve the following data:
 #.
    data given from depth generator:
      * ``CV_CAP_INTELPERC_DEPTH_MAP``       - each pixel is a 16-bit integer. The value indicates the distance from an object to the camera's XY plane or the Cartesian depth. (CV_16UC1)
      * ``CV_CAP_INTELPERC_UVDEPTH_MAP``     - each pixel contains two 32-bit floating point values in the range of 0-1, representing the mapping of depth coordinates to the color coordinates. (CV_32FC2)
      * ``CV_CAP_INTELPERC_IR_MAP``          - each pixel is a 16-bit integer. The value indicates the intensity of the reflected laser beam. (CV_16UC1)
 #.
    data given from RGB image generator:
      * ``CV_CAP_INTELPERC_IMAGE``           - color image. (CV_8UC3)
 In order to get depth map from depth sensor use ``VideoCapture::operator >>``, e. g. ::
    VideoCapture capture( CV_CAP_INTELPERC );
    for(;;)
    {
        Mat depthMap;
        capture >> depthMap;
        if( waitKey( 30 ) >= 0 )
            break;
    }
 For getting several data maps use ``VideoCapture::grab`` and ``VideoCapture::retrieve``, e.g. ::
    VideoCapture capture(CV_CAP_INTELPERC);
    for(;;)
    {
        Mat depthMap;
        Mat image;
        Mat irImage;
        capture.grab();
        capture.retrieve( depthMap, CV_CAP_INTELPERC_DEPTH_MAP );
        capture.retrieve(    image, CV_CAP_INTELPERC_IMAGE );
        capture.retrieve(  irImage, CV_CAP_INTELPERC_IR_MAP);
        if( waitKey( 30 ) >= 0 )
            break;
    }
 For setting and getting some property of sensor` data generators use ``VideoCapture::set`` and ``VideoCapture::get`` methods respectively, e.g. ::
    VideoCapture capture( CV_CAP_INTELPERC );
    capture.set( CV_CAP_INTELPERC_DEPTH_GENERATOR | CV_CAP_PROP_INTELPERC_PROFILE_IDX, 0 );
    cout << "FPS    " << capture.get( CV_CAP_INTELPERC_DEPTH_GENERATOR+CV_CAP_PROP_FPS ) << endl;
 Since two types of sensor's data generators are supported (image generator and depth generator), there are two flags that should be used to set/get property of the needed generator:
 * CV_CAP_INTELPERC_IMAGE_GENERATOR -- a flag for access to the image generator properties.
 * CV_CAP_INTELPERC_DEPTH_GENERATOR -- a flag for access to the depth generator properties. This flag value is assumed by default if neither of the two possible values of the property is set.
 For more information please refer to the example of usage intelperc_capture.cpp_ in ``opencv/samples/cpp`` folder.
 .. _intelperc_capture.cpp: https://github.com/Itseez/opencv/tree/master/samples/cpp/intelperc_capture.cpp
--- a/doc/user_guide/user_guide.rst
+++ b/doc/user_guide/user_guide.rst
@ -9,3 +9,4 @@ OpenCV User Guide
   ug_features2d.rst
   ug_highgui.rst
   ug_traincascade.rst
   ug_intelperc.rst
--- a/modules/core/doc/operations_on_arrays.rst
+++ b/modules/core/doc/operations_on_arrays.rst
@ -903,7 +903,7 @@ So, the function chooses an operation mode depending on the flags and size of th
    * When ``DFT_COMPLEX_OUTPUT`` is set, the output is a complex matrix of the same size as input.
-    * When ``DFT_COMPLEX_OUTPUT`` is not set, the output is a real matrix of the same size as input. In case of 2D transform, it uses the packed format as shown above. In case of a single 1D transform, it looks like the first row of the matrix above. In case of multiple 1D transforms (when using the ``DCT_ROWS``         flag), each row of the output matrix looks like the first row of the matrix above.
+    * When ``DFT_COMPLEX_OUTPUT`` is not set, the output is a real matrix of the same size as input. In case of 2D transform, it uses the packed format as shown above. In case of a single 1D transform, it looks like the first row of the matrix above. In case of multiple 1D transforms (when using the ``DFT_ROWS``         flag), each row of the output matrix looks like the first row of the matrix above.
 * If the input array is complex and either ``DFT_INVERSE``     or ``DFT_REAL_OUTPUT``     are not set, the output is a complex array of the same size as input. The function performs a forward or inverse 1D or 2D transform of the whole input array or each row of the input array independently, depending on the flags ``DFT_INVERSE`` and ``DFT_ROWS``.
--- a/modules/core/src/dxt.cpp
+++ b/modules/core/src/dxt.cpp
@ -2577,7 +2577,7 @@ void cv::dct( InputArray _src0, OutputArray _dst, int flags )
    DCTFunc dct_func = dct_tbl[(int)inv + (depth == CV_64F)*2];
-    if( (flags & DFT_ROWS) || src.rows == 1 ||
+    if( (flags & DCT_ROWS) || src.rows == 1 ||
        (src.cols == 1 && (src.isContinuous() && dst.isContinuous())))
    {
        stage = end_stage = 0;
@ -2597,7 +2597,7 @@ void cv::dct( InputArray _src0, OutputArray _dst, int flags )
        {
            len = src.cols;
            count = src.rows;
-            if( len == 1 && !(flags & DFT_ROWS) )
+            if( len == 1 && !(flags & DCT_ROWS) )
            {
                len = src.rows;
                count = 1;
--- a/modules/core/src/matrix.cpp
+++ b/modules/core/src/matrix.cpp
@ -2760,39 +2760,24 @@ void cv::transpose( InputArray _src, OutputArray _dst )
 }
 ////////////////////////////////////// completeSymm /////////////////////////////////////////
 void cv::completeSymm( InputOutputArray _m, bool LtoR )
 {
    Mat m = _m.getMat();
-    CV_Assert( m.dims <= 2 );
+    size_t step = m.step, esz = m.elemSize();
    CV_Assert( m.dims <= 2 && m.rows == m.cols );
-    int i, j, nrows = m.rows, type = m.type();
+    int rows = m.rows;
-    int j0 = 0, j1 = nrows;
+    int j0 = 0, j1 = rows;
    CV_Assert( m.rows == m.cols );
-    if( type == CV_32FC1 || type == CV_32SC1 )
+    uchar* data = m.data;
    for( int i = 0; i < rows; i++ )
    {
-        int* data = (int*)m.data;
+        if( !LtoR ) j1 = i; else j0 = i+1;
-        size_t step = m.step/sizeof(data[0]);
+        for( int j = j0; j < j1; j++ )
-        for( i = 0; i < nrows; i++ )
+            memcpy(data + (i*step + j*esz), data + (j*step + i*esz), esz);
        {
            if( !LtoR ) j1 = i; else j0 = i+1;
            for( j = j0; j < j1; j++ )
                data[i*step + j] = data[j*step + i];
        }
    }
    else if( type == CV_64FC1 )
    {
        double* data = (double*)m.data;
        size_t step = m.step/sizeof(data[0]);
        for( i = 0; i < nrows; i++ )
        {
            if( !LtoR ) j1 = i; else j0 = i+1;
            for( j = j0; j < j1; j++ )
                data[i*step + j] = data[j*step + i];
        }
    }
    else
        CV_Error( CV_StsUnsupportedFormat, "" );
 }
--- a/modules/highgui/CMakeLists.txt
+++ b/modules/highgui/CMakeLists.txt
@ -222,6 +222,12 @@ elseif(HAVE_QTKIT)
  list(APPEND HIGHGUI_LIBRARIES "-framework QTKit" "-framework QuartzCore" "-framework AppKit")
 endif()
 if(HAVE_INTELPERC)
  list(APPEND highgui_srcs src/cap_intelperc.cpp)
  ocv_include_directories(${INTELPERC_INCLUDE_DIR})
  list(APPEND HIGHGUI_LIBRARIES ${INTELPERC_LIBRARIES})
 endif(HAVE_INTELPERC)
 if(IOS)
  add_definitions(-DHAVE_IOS=1)
  list(APPEND highgui_srcs src/ios_conversions.mm src/cap_ios_abstract_camera.mm src/cap_ios_photo_camera.mm src/cap_ios_video_camera.mm)
--- a/modules/highgui/include/opencv2/highgui.hpp
+++ b/modules/highgui/include/opencv2/highgui.hpp
@ -271,7 +271,8 @@ enum { CAP_ANY          = 0,     // autodetect
       CAP_XIAPI        = 1100,  // XIMEA Camera API
       CAP_AVFOUNDATION = 1200,  // AVFoundation framework for iOS (OS X Lion will have the same API)
       CAP_GIGANETIX    = 1300,  // Smartek Giganetix GigEVisionSDK
-       CAP_MSMF         = 1400   // Microsoft Media Foundation (via videoInput)
+       CAP_MSMF         = 1400,  // Microsoft Media Foundation (via videoInput)
       CAP_INTELPERC    = 1500   // Intel Perceptual Computing SDK
     };
 // generic properties (based on DC1394 properties)
@ -496,6 +497,26 @@ enum { CAP_PROP_GIGA_FRAME_OFFSET_X   = 10001,
       CAP_PROP_GIGA_FRAME_SENS_HEIGH = 10006
     };
 enum { CAP_PROP_INTELPERC_PROFILE_COUNT               = 11001,
       CAP_PROP_INTELPERC_PROFILE_IDX                 = 11002,
       CAP_PROP_INTELPERC_DEPTH_LOW_CONFIDENCE_VALUE  = 11003,
       CAP_PROP_INTELPERC_DEPTH_SATURATION_VALUE      = 11004,
       CAP_PROP_INTELPERC_DEPTH_CONFIDENCE_THRESHOLD  = 11005,
       CAP_PROP_INTELPERC_DEPTH_FOCAL_LENGTH_HORZ     = 11006,
       CAP_PROP_INTELPERC_DEPTH_FOCAL_LENGTH_VERT     = 11007
     };
 // Intel PerC streams
 enum { CAP_INTELPERC_DEPTH_GENERATOR = 1 << 29,
       CAP_INTELPERC_IMAGE_GENERATOR = 1 << 28,
       CAP_INTELPERC_GENERATORS_MASK = CAP_INTELPERC_DEPTH_GENERATOR + CAP_INTELPERC_IMAGE_GENERATOR
     };
 enum { CAP_INTELPERC_DEPTH_MAP              = 0, // Each pixel is a 16-bit integer. The value indicates the distance from an object to the camera's XY plane or the Cartesian depth.
       CAP_INTELPERC_UVDEPTH_MAP            = 1, // Each pixel contains two 32-bit floating point values in the range of 0-1, representing the mapping of depth coordinates to the color coordinates.
       CAP_INTELPERC_IR_MAP                 = 2, // Each pixel is a 16-bit integer. The value indicates the intensity of the reflected laser beam.
       CAP_INTELPERC_IMAGE                  = 3
     };
 class CV_EXPORTS_W VideoCapture
 {
--- a/modules/highgui/include/opencv2/highgui/highgui_c.h
+++ b/modules/highgui/include/opencv2/highgui/highgui_c.h
@ -313,7 +313,9 @@ enum
    CV_CAP_AVFOUNDATION = 1200,  // AVFoundation framework for iOS (OS X Lion will have the same API)
-    CV_CAP_GIGANETIX = 1300  // Smartek Giganetix GigEVisionSDK
+    CV_CAP_GIGANETIX = 1300,  // Smartek Giganetix GigEVisionSDK
    CV_CAP_INTELPERC = 1500 // Intel Perceptual Computing SDK
 };
 /* start capturing frames from camera: index = camera_index + domain_offset (CV_CAP_*) */
@ -459,16 +461,29 @@ enum
    CV_CAP_PROP_IOS_DEVICE_EXPOSURE = 9002,
    CV_CAP_PROP_IOS_DEVICE_FLASH = 9003,
    CV_CAP_PROP_IOS_DEVICE_WHITEBALANCE = 9004,
-    CV_CAP_PROP_IOS_DEVICE_TORCH = 9005
+    CV_CAP_PROP_IOS_DEVICE_TORCH = 9005,
    // Properties of cameras available through Smartek Giganetix Ethernet Vision interface
    /* --- Vladimir Litvinenko (litvinenko.vladimir@gmail.com) --- */
-    ,CV_CAP_PROP_GIGA_FRAME_OFFSET_X = 10001,
+    CV_CAP_PROP_GIGA_FRAME_OFFSET_X = 10001,
    CV_CAP_PROP_GIGA_FRAME_OFFSET_Y = 10002,
    CV_CAP_PROP_GIGA_FRAME_WIDTH_MAX = 10003,
    CV_CAP_PROP_GIGA_FRAME_HEIGH_MAX = 10004,
    CV_CAP_PROP_GIGA_FRAME_SENS_WIDTH = 10005,
-    CV_CAP_PROP_GIGA_FRAME_SENS_HEIGH = 10006
+    CV_CAP_PROP_GIGA_FRAME_SENS_HEIGH = 10006,
    CV_CAP_PROP_INTELPERC_PROFILE_COUNT               = 11001,
    CV_CAP_PROP_INTELPERC_PROFILE_IDX                 = 11002,
    CV_CAP_PROP_INTELPERC_DEPTH_LOW_CONFIDENCE_VALUE  = 11003,
    CV_CAP_PROP_INTELPERC_DEPTH_SATURATION_VALUE      = 11004,
    CV_CAP_PROP_INTELPERC_DEPTH_CONFIDENCE_THRESHOLD  = 11005,
    CV_CAP_PROP_INTELPERC_DEPTH_FOCAL_LENGTH_HORZ     = 11006,
    CV_CAP_PROP_INTELPERC_DEPTH_FOCAL_LENGTH_VERT     = 11007,
    // Intel PerC streams
    CV_CAP_INTELPERC_DEPTH_GENERATOR = 1 << 29,
    CV_CAP_INTELPERC_IMAGE_GENERATOR = 1 << 28,
    CV_CAP_INTELPERC_GENERATORS_MASK = CV_CAP_INTELPERC_DEPTH_GENERATOR + CV_CAP_INTELPERC_IMAGE_GENERATOR
 };
 enum
@ -549,6 +564,14 @@ enum
    CV_CAP_ANDROID_ANTIBANDING_OFF
 };
 enum
 {
    CV_CAP_INTELPERC_DEPTH_MAP              = 0, // Each pixel is a 16-bit integer. The value indicates the distance from an object to the camera's XY plane or the Cartesian depth.
    CV_CAP_INTELPERC_UVDEPTH_MAP            = 1, // Each pixel contains two 32-bit floating point values in the range of 0-1, representing the mapping of depth coordinates to the color coordinates.
    CV_CAP_INTELPERC_IR_MAP                 = 2, // Each pixel is a 16-bit integer. The value indicates the intensity of the reflected laser beam.
    CV_CAP_INTELPERC_IMAGE                  = 3
 };
 /* retrieve or set capture properties */
 CVAPI(double) cvGetCaptureProperty( CvCapture* capture, int property_id );
 CVAPI(int)    cvSetCaptureProperty( CvCapture* capture, int property_id, double value );
--- a/modules/highgui/src/cap.cpp
+++ b/modules/highgui/src/cap.cpp
@ -155,6 +155,9 @@ CV_IMPL CvCapture * cvCreateCameraCapture (int index)
 #endif
 #ifdef HAVE_GIGE_API
        CV_CAP_GIGANETIX,
 #endif
 #ifdef HAVE_INTELPERC
        CV_CAP_INTELPERC,
 #endif
        -1
    };
@ -193,6 +196,7 @@ CV_IMPL CvCapture * cvCreateCameraCapture (int index)
    defined(HAVE_AVFOUNDATION) || \
    defined(HAVE_ANDROID_NATIVE_CAMERA) || \
    defined(HAVE_GIGE_API) || \
    defined(HAVE_INTELPERC)    || \
    (0)
        // local variable to memorize the captured device
        CvCapture *capture;
@ -342,6 +346,13 @@ CV_IMPL CvCapture * cvCreateCameraCapture (int index)
        break; // CV_CAP_GIGANETIX
 #endif
 #ifdef HAVE_INTELPERC
        case CV_CAP_INTELPERC:
            capture = cvCreateCameraCapture_IntelPerC(index);
            if (capture)
                return capture;
        break; // CV_CAP_INTEL_PERC
 #endif
        }
    }
--- a/modules/highgui/src/cap_intelperc.cpp
+++ b/modules/highgui/src/cap_intelperc.cpp
@ -0,0 +1,714 @@
 #include "precomp.hpp"
 #ifdef HAVE_INTELPERC
 #include "pxcsession.h"
 #include "pxcsmartptr.h"
 #include "pxccapture.h"
 class CvIntelPerCStreamBase
 {
 protected:
    struct FrameInternal
    {
        IplImage* retrieveFrame()
        {
            if (m_mat.empty())
                return NULL;
            m_iplHeader = IplImage(m_mat);
            return &m_iplHeader;
        }
        cv::Mat m_mat;
    private:
        IplImage m_iplHeader;
    };
 public:
    CvIntelPerCStreamBase()
        : m_profileIdx(-1)
        , m_frameIdx(0)
        , m_timeStampStartNS(0)
    {
    }
    virtual ~CvIntelPerCStreamBase()
    {
    }
    bool isValid()
    {
        return (m_device.IsValid() && m_stream.IsValid());
    }
    bool grabFrame()
    {
        if (!m_stream.IsValid())
            return false;
        if (-1 == m_profileIdx)
        {
            if (!setProperty(CV_CAP_PROP_INTELPERC_PROFILE_IDX, 0))
                return false;
        }
        PXCSmartPtr<PXCImage> pxcImage; PXCSmartSP sp;
        if (PXC_STATUS_NO_ERROR > m_stream->ReadStreamAsync(&pxcImage, &sp))
            return false;
        if (PXC_STATUS_NO_ERROR > sp->Synchronize())
            return false;
        if (0 == m_timeStampStartNS)
            m_timeStampStartNS = pxcImage->QueryTimeStamp();
        m_timeStamp = (double)((pxcImage->QueryTimeStamp() - m_timeStampStartNS) / 10000);
        m_frameIdx++;
        return prepareIplImage(pxcImage);
    }
    int getProfileIDX() const
    {
        return m_profileIdx;
    }
 public:
    virtual bool initStream(PXCSession *session)            = 0;
    virtual double getProperty(int propIdx)
    {
        double ret = 0.0;
        switch (propIdx)
        {
        case CV_CAP_PROP_INTELPERC_PROFILE_COUNT:
            ret = (double)m_profiles.size();
            break;
        case CV_CAP_PROP_FRAME_WIDTH :
            if ((0 <= m_profileIdx) && (m_profileIdx < m_profiles.size()))
                ret = (double)m_profiles[m_profileIdx].imageInfo.width;
            break;
        case CV_CAP_PROP_FRAME_HEIGHT :
            if ((0 <= m_profileIdx) && (m_profileIdx < m_profiles.size()))
                ret = (double)m_profiles[m_profileIdx].imageInfo.height;
            break;
        case CV_CAP_PROP_FPS :
            if ((0 <= m_profileIdx) && (m_profileIdx < m_profiles.size()))
            {
                ret = ((double)m_profiles[m_profileIdx].frameRateMin.numerator / (double)m_profiles[m_profileIdx].frameRateMin.denominator
                        + (double)m_profiles[m_profileIdx].frameRateMax.numerator / (double)m_profiles[m_profileIdx].frameRateMax.denominator) / 2.0;
            }
            break;
        case CV_CAP_PROP_POS_FRAMES:
            ret  = (double)m_frameIdx;
            break;
        case CV_CAP_PROP_POS_MSEC:
            ret  = m_timeStamp;
            break;
        };
        return ret;
    }
    virtual bool setProperty(int propIdx, double propVal)
    {
        bool isSet = false;
        switch (propIdx)
        {
        case CV_CAP_PROP_INTELPERC_PROFILE_IDX:
            {
                int propValInt = (int)propVal;
                if ((0 <= propValInt) && (propValInt < m_profiles.size()))
                {
                    if (m_profileIdx != propValInt)
                    {
                        m_profileIdx = propValInt;
                        if (m_stream.IsValid())
                            m_stream->SetProfile(&m_profiles[m_profileIdx]);
                        m_frameIdx = 0;
                        m_timeStampStartNS = 0;
                    }
                    isSet = true;
                }
            }
            break;
        };
        return isSet;
    }
 protected:
    PXCSmartPtr<PXCCapture::Device> m_device;
    bool initDevice(PXCSession *session)
    {
        if (NULL == session)
            return false;
        pxcStatus sts = PXC_STATUS_NO_ERROR;
        PXCSession::ImplDesc templat;
        memset(&templat,0,sizeof(templat));
        templat.group   = PXCSession::IMPL_GROUP_SENSOR;
        templat.subgroup= PXCSession::IMPL_SUBGROUP_VIDEO_CAPTURE;
        for (int modidx = 0; PXC_STATUS_NO_ERROR <= sts; modidx++)
        {
            PXCSession::ImplDesc desc;
            sts = session->QueryImpl(&templat, modidx, &desc);
            if (PXC_STATUS_NO_ERROR > sts)
                break;
            PXCSmartPtr<PXCCapture> capture;
            sts = session->CreateImpl<PXCCapture>(&desc, &capture);
            if (!capture.IsValid())
                continue;
            /* enumerate devices */
            for (int devidx = 0; PXC_STATUS_NO_ERROR <= sts; devidx++)
            {
                PXCSmartPtr<PXCCapture::Device> device;
                sts = capture->CreateDevice(devidx, &device);
                if (PXC_STATUS_NO_ERROR <= sts)
                {
                    m_device = device.ReleasePtr();
                    return true;
                }
            }
        }
        return false;
    }
    PXCSmartPtr<PXCCapture::VideoStream> m_stream;
    void initStreamImpl(PXCImage::ImageType type)
    {
        if (!m_device.IsValid())
            return;
        pxcStatus sts = PXC_STATUS_NO_ERROR;
        /* enumerate streams */
        for (int streamidx = 0; PXC_STATUS_NO_ERROR <= sts; streamidx++)
        {
            PXCCapture::Device::StreamInfo sinfo;
            sts = m_device->QueryStream(streamidx, &sinfo);
            if (PXC_STATUS_NO_ERROR > sts)
                break;
            if (PXCCapture::VideoStream::CUID != sinfo.cuid)
                continue;
            if (type != sinfo.imageType)
                continue;
            sts = m_device->CreateStream<PXCCapture::VideoStream>(streamidx, &m_stream);
            if (PXC_STATUS_NO_ERROR == sts)
                break;
            m_stream.ReleaseRef();
        }
    }
 protected:
    std::vector<PXCCapture::VideoStream::ProfileInfo> m_profiles;
    int m_profileIdx;
    int m_frameIdx;
    pxcU64 m_timeStampStartNS;
    double m_timeStamp;
    virtual bool validProfile(const PXCCapture::VideoStream::ProfileInfo& /*pinfo*/)
    {
        return true;
    }
    void enumProfiles()
    {
        m_profiles.clear();
        if (!m_stream.IsValid())
            return;
        pxcStatus sts = PXC_STATUS_NO_ERROR;
        for (int profidx = 0; PXC_STATUS_NO_ERROR <= sts; profidx++)
        {
            PXCCapture::VideoStream::ProfileInfo pinfo;
            sts = m_stream->QueryProfile(profidx, &pinfo);
            if (PXC_STATUS_NO_ERROR > sts)
                break;
            if (validProfile(pinfo))
                m_profiles.push_back(pinfo);
        }
    }
    virtual bool prepareIplImage(PXCImage *pxcImage) = 0;
 };
 class CvIntelPerCStreamImage
    : public CvIntelPerCStreamBase
 {
 public:
    CvIntelPerCStreamImage()
    {
    }
    virtual ~CvIntelPerCStreamImage()
    {
    }
    virtual bool initStream(PXCSession *session)
    {
        if (!initDevice(session))
            return false;
        initStreamImpl(PXCImage::IMAGE_TYPE_COLOR);
        if (!m_stream.IsValid())
            return false;
        enumProfiles();
        return true;
    }
    virtual double getProperty(int propIdx)
    {
        switch (propIdx)
        {
        case CV_CAP_PROP_BRIGHTNESS:
            {
                if (!m_device.IsValid())
                    return 0.0;
                float fret = 0.0f;
                if (PXC_STATUS_NO_ERROR == m_device->QueryProperty(PXCCapture::Device::PROPERTY_COLOR_BRIGHTNESS, &fret))
                    return (double)fret;
                return 0.0;
            }
            break;
        case CV_CAP_PROP_CONTRAST:
            {
                if (!m_device.IsValid())
                    return 0.0;
                float fret = 0.0f;
                if (PXC_STATUS_NO_ERROR == m_device->QueryProperty(PXCCapture::Device::PROPERTY_COLOR_CONTRAST, &fret))
                    return (double)fret;
                return 0.0;
            }
            break;
        case CV_CAP_PROP_SATURATION:
            {
                if (!m_device.IsValid())
                    return 0.0;
                float fret = 0.0f;
                if (PXC_STATUS_NO_ERROR == m_device->QueryProperty(PXCCapture::Device::PROPERTY_COLOR_SATURATION, &fret))
                    return (double)fret;
                return 0.0;
            }
            break;
        case CV_CAP_PROP_HUE:
            {
                if (!m_device.IsValid())
                    return 0.0;
                float fret = 0.0f;
                if (PXC_STATUS_NO_ERROR == m_device->QueryProperty(PXCCapture::Device::PROPERTY_COLOR_HUE, &fret))
                    return (double)fret;
                return 0.0;
            }
            break;
        case CV_CAP_PROP_GAMMA:
            {
                if (!m_device.IsValid())
                    return 0.0;
                float fret = 0.0f;
                if (PXC_STATUS_NO_ERROR == m_device->QueryProperty(PXCCapture::Device::PROPERTY_COLOR_GAMMA, &fret))
                    return (double)fret;
                return 0.0;
            }
            break;
        case CV_CAP_PROP_SHARPNESS:
            {
                if (!m_device.IsValid())
                    return 0.0;
                float fret = 0.0f;
                if (PXC_STATUS_NO_ERROR == m_device->QueryProperty(PXCCapture::Device::PROPERTY_COLOR_SHARPNESS, &fret))
                    return (double)fret;
                return 0.0;
            }
            break;
        case CV_CAP_PROP_GAIN:
            {
                if (!m_device.IsValid())
                    return 0.0;
                float fret = 0.0f;
                if (PXC_STATUS_NO_ERROR == m_device->QueryProperty(PXCCapture::Device::PROPERTY_COLOR_GAIN, &fret))
                    return (double)fret;
                return 0.0;
            }
            break;
        case CV_CAP_PROP_BACKLIGHT:
            {
                if (!m_device.IsValid())
                    return 0.0;
                float fret = 0.0f;
                if (PXC_STATUS_NO_ERROR == m_device->QueryProperty(PXCCapture::Device::PROPERTY_COLOR_BACK_LIGHT_COMPENSATION, &fret))
                    return (double)fret;
                return 0.0;
            }
            break;
        case CV_CAP_PROP_EXPOSURE:
            {
                if (!m_device.IsValid())
                    return 0.0;
                float fret = 0.0f;
                if (PXC_STATUS_NO_ERROR == m_device->QueryProperty(PXCCapture::Device::PROPERTY_COLOR_EXPOSURE, &fret))
                    return (double)fret;
                return 0.0;
            }
            break;
        //Add image stream specific properties
        }
        return CvIntelPerCStreamBase::getProperty(propIdx);
    }
    virtual bool setProperty(int propIdx, double propVal)
    {
        switch (propIdx)
        {
        case CV_CAP_PROP_BRIGHTNESS:
            {
                if (!m_device.IsValid())
                    return false;
                return (PXC_STATUS_NO_ERROR == m_device->SetProperty(PXCCapture::Device::PROPERTY_COLOR_BRIGHTNESS, (float)propVal));
            }
            break;
        case CV_CAP_PROP_CONTRAST:
            {
                if (!m_device.IsValid())
                    return false;
                return (PXC_STATUS_NO_ERROR == m_device->SetProperty(PXCCapture::Device::PROPERTY_COLOR_CONTRAST, (float)propVal));
            }
            break;
        case CV_CAP_PROP_SATURATION:
            {
                if (!m_device.IsValid())
                    return false;
                return (PXC_STATUS_NO_ERROR == m_device->SetProperty(PXCCapture::Device::PROPERTY_COLOR_SATURATION, (float)propVal));
            }
            break;
        case CV_CAP_PROP_HUE:
            {
                if (!m_device.IsValid())
                    return false;
                return (PXC_STATUS_NO_ERROR == m_device->SetProperty(PXCCapture::Device::PROPERTY_COLOR_HUE, (float)propVal));
            }
            break;
        case CV_CAP_PROP_GAMMA:
            {
                if (!m_device.IsValid())
                    return false;
                return (PXC_STATUS_NO_ERROR == m_device->SetProperty(PXCCapture::Device::PROPERTY_COLOR_GAMMA, (float)propVal));
            }
            break;
        case CV_CAP_PROP_SHARPNESS:
            {
                if (!m_device.IsValid())
                    return false;
                return (PXC_STATUS_NO_ERROR == m_device->SetProperty(PXCCapture::Device::PROPERTY_COLOR_SHARPNESS, (float)propVal));
            }
            break;
        case CV_CAP_PROP_GAIN:
            {
                if (!m_device.IsValid())
                    return false;
                return (PXC_STATUS_NO_ERROR == m_device->SetProperty(PXCCapture::Device::PROPERTY_COLOR_GAIN, (float)propVal));
            }
            break;
        case CV_CAP_PROP_BACKLIGHT:
            {
                if (!m_device.IsValid())
                    return false;
                return (PXC_STATUS_NO_ERROR == m_device->SetProperty(PXCCapture::Device::PROPERTY_COLOR_BACK_LIGHT_COMPENSATION, (float)propVal));
            }
            break;
        case CV_CAP_PROP_EXPOSURE:
            {
                if (!m_device.IsValid())
                    return false;
                return (PXC_STATUS_NO_ERROR == m_device->SetProperty(PXCCapture::Device::PROPERTY_COLOR_EXPOSURE, (float)propVal));
            }
            break;
        //Add image stream specific properties
        }
        return CvIntelPerCStreamBase::setProperty(propIdx, propVal);
    }
 public:
    IplImage* retrieveFrame()
    {
        return m_frame.retrieveFrame();
    }
 protected:
    FrameInternal m_frame;
    bool prepareIplImage(PXCImage *pxcImage)
    {
        if (NULL == pxcImage)
            return false;
        PXCImage::ImageInfo info;
        pxcImage->QueryInfo(&info);
        PXCImage::ImageData data;
        pxcImage->AcquireAccess(PXCImage::ACCESS_READ, PXCImage::COLOR_FORMAT_RGB24, &data);
        if (PXCImage::SURFACE_TYPE_SYSTEM_MEMORY != data.type)
            return false;
        cv::Mat temp(info.height, info.width, CV_8UC3, data.planes[0], data.pitches[0]);
        temp.copyTo(m_frame.m_mat);
        pxcImage->ReleaseAccess(&data);
        return true;
    }
 };
 class CvIntelPerCStreamDepth
    : public CvIntelPerCStreamBase
 {
 public:
    CvIntelPerCStreamDepth()
    {
    }
    virtual ~CvIntelPerCStreamDepth()
    {
    }
    virtual bool initStream(PXCSession *session)
    {
        if (!initDevice(session))
            return false;
        initStreamImpl(PXCImage::IMAGE_TYPE_DEPTH);
        if (!m_stream.IsValid())
            return false;
        enumProfiles();
        return true;
    }
    virtual double getProperty(int propIdx)
    {
        switch (propIdx)
        {
        case CV_CAP_PROP_INTELPERC_DEPTH_LOW_CONFIDENCE_VALUE:
            {
                if (!m_device.IsValid())
                    return 0.0;
                float fret = 0.0f;
                if (PXC_STATUS_NO_ERROR == m_device->QueryProperty(PXCCapture::Device::PROPERTY_DEPTH_LOW_CONFIDENCE_VALUE, &fret))
                    return (double)fret;
                return 0.0;
            }
            break;
        case CV_CAP_PROP_INTELPERC_DEPTH_SATURATION_VALUE:
            {
                if (!m_device.IsValid())
                    return 0.0;
                float fret = 0.0f;
                if (PXC_STATUS_NO_ERROR == m_device->QueryProperty(PXCCapture::Device::PROPERTY_DEPTH_SATURATION_VALUE, &fret))
                    return (double)fret;
                return 0.0;
            }
            break;
        case CV_CAP_PROP_INTELPERC_DEPTH_CONFIDENCE_THRESHOLD:
            {
                if (!m_device.IsValid())
                    return 0.0;
                float fret = 0.0f;
                if (PXC_STATUS_NO_ERROR == m_device->QueryProperty(PXCCapture::Device::PROPERTY_DEPTH_CONFIDENCE_THRESHOLD, &fret))
                    return (double)fret;
                return 0.0;
            }
            break;
        case CV_CAP_PROP_INTELPERC_DEPTH_FOCAL_LENGTH_HORZ:
            {
                if (!m_device.IsValid())
                    return 0.0f;
                PXCPointF32 ptf;
                if (PXC_STATUS_NO_ERROR == m_device->QueryPropertyAsPoint(PXCCapture::Device::PROPERTY_DEPTH_FOCAL_LENGTH, &ptf))
                    return (double)ptf.x;
                return 0.0;
            }
            break;
        case CV_CAP_PROP_INTELPERC_DEPTH_FOCAL_LENGTH_VERT:
            {
                if (!m_device.IsValid())
                    return 0.0f;
                PXCPointF32 ptf;
                if (PXC_STATUS_NO_ERROR == m_device->QueryPropertyAsPoint(PXCCapture::Device::PROPERTY_DEPTH_FOCAL_LENGTH, &ptf))
                    return (double)ptf.y;
                return 0.0;
            }
            break;
            //Add depth stream sepcific properties
        }
        return CvIntelPerCStreamBase::getProperty(propIdx);
    }
    virtual bool setProperty(int propIdx, double propVal)
    {
        switch (propIdx)
        {
        case CV_CAP_PROP_INTELPERC_DEPTH_LOW_CONFIDENCE_VALUE:
            {
                if (!m_device.IsValid())
                    return false;
                return (PXC_STATUS_NO_ERROR == m_device->SetProperty(PXCCapture::Device::PROPERTY_DEPTH_LOW_CONFIDENCE_VALUE, (float)propVal));
            }
            break;
        case CV_CAP_PROP_INTELPERC_DEPTH_SATURATION_VALUE:
            {
                if (!m_device.IsValid())
                    return false;
                return (PXC_STATUS_NO_ERROR == m_device->SetProperty(PXCCapture::Device::PROPERTY_DEPTH_SATURATION_VALUE, (float)propVal));
            }
            break;
        case CV_CAP_PROP_INTELPERC_DEPTH_CONFIDENCE_THRESHOLD:
            {
                if (!m_device.IsValid())
                    return false;
                return (PXC_STATUS_NO_ERROR == m_device->SetProperty(PXCCapture::Device::PROPERTY_DEPTH_CONFIDENCE_THRESHOLD, (float)propVal));
            }
            break;
        //Add depth stream sepcific properties
        }
        return CvIntelPerCStreamBase::setProperty(propIdx, propVal);
    }
 public:
    IplImage* retrieveDepthFrame()
    {
        return m_frameDepth.retrieveFrame();
    }
    IplImage* retrieveIRFrame()
    {
        return m_frameIR.retrieveFrame();
    }
    IplImage* retrieveUVFrame()
    {
        return m_frameUV.retrieveFrame();
    }
 protected:
    virtual bool validProfile(const PXCCapture::VideoStream::ProfileInfo& pinfo)
    {
        return (PXCImage::COLOR_FORMAT_DEPTH == pinfo.imageInfo.format);
    }
 protected:
    FrameInternal m_frameDepth;
    FrameInternal m_frameIR;
    FrameInternal m_frameUV;
    bool prepareIplImage(PXCImage *pxcImage)
    {
        if (NULL == pxcImage)
            return false;
        PXCImage::ImageInfo info;
        pxcImage->QueryInfo(&info);
        PXCImage::ImageData data;
        pxcImage->AcquireAccess(PXCImage::ACCESS_READ, &data);
        if (PXCImage::SURFACE_TYPE_SYSTEM_MEMORY != data.type)
            return false;
        if (PXCImage::COLOR_FORMAT_DEPTH != data.format)
            return false;
        {
            cv::Mat temp(info.height, info.width, CV_16SC1, data.planes[0], data.pitches[0]);
            temp.copyTo(m_frameDepth.m_mat);
        }
        {
            cv::Mat temp(info.height, info.width, CV_16SC1, data.planes[1], data.pitches[1]);
            temp.copyTo(m_frameIR.m_mat);
        }
        {
            cv::Mat temp(info.height, info.width, CV_32FC2, data.planes[2], data.pitches[2]);
            temp.copyTo(m_frameUV.m_mat);
        }
        pxcImage->ReleaseAccess(&data);
        return true;
    }
 };
 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 class CvCapture_IntelPerC : public CvCapture
 {
 public:
    CvCapture_IntelPerC(int /*index*/)
        : m_contextOpened(false)
    {
        pxcStatus sts = PXCSession_Create(&m_session);
        if (PXC_STATUS_NO_ERROR > sts)
            return;
        m_contextOpened = m_imageStream.initStream(m_session);
        m_contextOpened &= m_depthStream.initStream(m_session);
    }
    virtual ~CvCapture_IntelPerC(){}
    virtual double getProperty(int propIdx)
    {
        double propValue = 0;
        int purePropIdx = propIdx & ~CV_CAP_INTELPERC_GENERATORS_MASK;
        if (CV_CAP_INTELPERC_IMAGE_GENERATOR == (propIdx & CV_CAP_INTELPERC_GENERATORS_MASK))
        {
            propValue = m_imageStream.getProperty(purePropIdx);
        }
        else if (CV_CAP_INTELPERC_DEPTH_GENERATOR == (propIdx & CV_CAP_INTELPERC_GENERATORS_MASK))
        {
            propValue = m_depthStream.getProperty(purePropIdx);
        }
        else
        {
            propValue = m_depthStream.getProperty(purePropIdx);
        }
        return propValue;
    }
    virtual bool setProperty(int propIdx, double propVal)
    {
        bool isSet = false;
        int purePropIdx = propIdx & ~CV_CAP_INTELPERC_GENERATORS_MASK;
        if (CV_CAP_INTELPERC_IMAGE_GENERATOR == (propIdx & CV_CAP_INTELPERC_GENERATORS_MASK))
        {
            isSet = m_imageStream.setProperty(purePropIdx, propVal);
        }
        else if (CV_CAP_INTELPERC_DEPTH_GENERATOR == (propIdx & CV_CAP_INTELPERC_GENERATORS_MASK))
        {
            isSet = m_depthStream.setProperty(purePropIdx, propVal);
        }
        else
        {
            isSet = m_depthStream.setProperty(purePropIdx, propVal);
        }
        return isSet;
    }
    bool grabFrame()
    {
        if (!isOpened())
            return false;
        bool isGrabbed = false;
        if (m_depthStream.isValid())
            isGrabbed = m_depthStream.grabFrame();
        if ((m_imageStream.isValid()) && (-1 != m_imageStream.getProfileIDX()))
            isGrabbed &= m_imageStream.grabFrame();
        return isGrabbed;
    }
    virtual IplImage* retrieveFrame(int outputType)
    {
        IplImage* image = 0;
        switch (outputType)
        {
        case CV_CAP_INTELPERC_DEPTH_MAP:
            image = m_depthStream.retrieveDepthFrame();
            break;
        case CV_CAP_INTELPERC_UVDEPTH_MAP:
            image = m_depthStream.retrieveUVFrame();
            break;
        case CV_CAP_INTELPERC_IR_MAP:
            image = m_depthStream.retrieveIRFrame();
            break;
        case CV_CAP_INTELPERC_IMAGE:
            image = m_imageStream.retrieveFrame();
            break;
        }
        CV_Assert(NULL != image);
        return image;
    }
    bool isOpened() const
    {
        return m_contextOpened;
    }
 protected:
    bool m_contextOpened;
    PXCSmartPtr<PXCSession> m_session;
    CvIntelPerCStreamImage m_imageStream;
    CvIntelPerCStreamDepth m_depthStream;
 };
 CvCapture* cvCreateCameraCapture_IntelPerC(int index)
 {
    CvCapture_IntelPerC* capture = new CvCapture_IntelPerC(index);
    if( capture->isOpened() )
        return capture;
    delete capture;
    return 0;
 }
 #endif //HAVE_INTELPERC
--- a/modules/highgui/src/precomp.hpp
+++ b/modules/highgui/src/precomp.hpp
@ -128,6 +128,7 @@ CvCapture* cvCreateFileCapture_OpenNI( const char* filename );
 CvCapture* cvCreateCameraCapture_Android( int index );
 CvCapture* cvCreateCameraCapture_XIMEA( int index );
 CvCapture* cvCreateCameraCapture_AVFoundation(int index);
 CvCapture* cvCreateCameraCapture_IntelPerC(int index);
 CVAPI(int) cvHaveImageReader(const char* filename);
--- a/modules/highgui/test/test_precomp.hpp
+++ b/modules/highgui/test/test_precomp.hpp
@ -35,6 +35,7 @@
    defined(HAVE_XIMEA)        || \
    defined(HAVE_AVFOUNDATION) || \
    defined(HAVE_GIGE_API)     || \
    defined(HAVE_INTELPERC)    || \
    (0)
    //defined(HAVE_ANDROID_NATIVE_CAMERA) ||   - enable after #1193
 #  define BUILD_WITH_CAMERA_SUPPORT 1
--- a/modules/imgproc/src/imgwarp.cpp
+++ b/modules/imgproc/src/imgwarp.cpp
@ -3299,7 +3299,10 @@ public:
                    if( m1->type() == CV_16SC2 && (m2->type() == CV_16UC1 || m2->type() == CV_16SC1) )
                    {
                        bufxy = (*m1)(Rect(x, y, bcols, brows));
-                        bufa = (*m2)(Rect(x, y, bcols, brows));
+
                        const ushort* sA = (const ushort*)(m2->data + m2->step*(y+y1)) + x;
                        for( x1 = 0; x1 < bcols; x1++ )
                            A[x1] = (ushort)(sA[x1] & (INTER_TAB_SIZE2-1));
                    }
                    else if( planar_input )
                    {
@ -3680,7 +3683,7 @@ void cv::convertMaps( InputArray _map1, InputArray _map2,
        {
            for( x = 0; x < size.width; x++ )
            {
-                int fxy = src2 ? src2[x] : 0;
+                int fxy = src2 ? src2[x] & (INTER_TAB_SIZE2-1) : 0;
                dst1f[x] = src1[x*2] + (fxy & (INTER_TAB_SIZE-1))*scale;
                dst2f[x] = src1[x*2+1] + (fxy >> INTER_BITS)*scale;
            }
@ -3689,7 +3692,7 @@ void cv::convertMaps( InputArray _map1, InputArray _map2,
        {
            for( x = 0; x < size.width; x++ )
            {
-                int fxy = src2 ? src2[x] : 0;
+                int fxy = src2 ? src2[x] & (INTER_TAB_SIZE2-1): 0;
                dst1f[x*2] = src1[x*2] + (fxy & (INTER_TAB_SIZE-1))*scale;
                dst1f[x*2+1] = src1[x*2+1] + (fxy >> INTER_BITS)*scale;
            }
--- a/modules/java/generator/gen_java.py
+++ b/modules/java/generator/gen_java.py
@ -18,6 +18,8 @@ class_ignore_list = (
 const_ignore_list = (
    "CV_CAP_OPENNI",
    "CV_CAP_PROP_OPENNI_",
    "CV_CAP_INTELPERC",
    "CV_CAP_PROP_INTELPERC_"
    "WINDOW_AUTOSIZE",
    "CV_WND_PROP_",
    "CV_WINDOW_",
--- a/modules/nonfree/src/opencl/surf.cl
+++ b/modules/nonfree/src/opencl/surf.cl
@ -12,6 +12,7 @@
 //
 // Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
 // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
 // Copyright (C) 2013, Intel Corporation, all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // @Authors
@ -66,8 +67,8 @@ uint read_sumTex(IMAGE_INT32 img, sampler_t sam, int2 coord, int rows, int cols,
 uchar read_imgTex(IMAGE_INT8 img, sampler_t sam, float2 coord, int rows, int cols, int elemPerRow)
 {
 #ifdef DISABLE_IMAGE2D
-    int x = clamp(convert_int_rte(coord.x), 0, cols - 1);
+    int x = clamp(round(coord.x), 0, cols - 1);
-    int y = clamp(convert_int_rte(coord.y), 0, rows - 1);
+    int y = clamp(round(coord.y), 0, rows - 1);
    return img[elemPerRow * y + x];
 #else
    return (uchar)read_imageui(img, sam, coord).x;
@ -98,6 +99,7 @@ __constant sampler_t sampler    = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAM
 #define CV_PI_F 3.14159265f
 #endif
 // Use integral image to calculate haar wavelets.
 // N = 2
 // for simple haar paatern
@ -114,10 +116,10 @@ float icvCalcHaarPatternSum_2(
    F d = 0;
-    int2 dx1 = convert_int2_rte(ratio * src[0]);
+    int2 dx1 = convert_int2(round(ratio * src[0]));
-    int2 dy1 = convert_int2_rte(ratio * src[1]);
+    int2 dy1 = convert_int2(round(ratio * src[1]));
-    int2 dx2 = convert_int2_rte(ratio * src[2]);
+    int2 dx2 = convert_int2(round(ratio * src[2]));
-    int2 dy2 = convert_int2_rte(ratio * src[3]);
+    int2 dy2 = convert_int2(round(ratio * src[3]));
    F t = 0;
    t += read_sumTex( sumTex, sampler, (int2)(x + dx1.x, y + dy1.x), rows, cols, elemPerRow );
@ -136,106 +138,9 @@ float icvCalcHaarPatternSum_2(
    return (float)d;
 }
 // N = 3
 float icvCalcHaarPatternSum_3(
    IMAGE_INT32 sumTex,
    __constant float4 *src,
    int oldSize,
    int newSize,
    int y, int x,
    int rows, int cols, int elemPerRow)
 {
    float ratio = (float)newSize / oldSize;
    F d = 0;
    int4 dx1 = convert_int4_rte(ratio * src[0]);
    int4 dy1 = convert_int4_rte(ratio * src[1]);
    int4 dx2 = convert_int4_rte(ratio * src[2]);
    int4 dy2 = convert_int4_rte(ratio * src[3]);
    F t = 0;
    t += read_sumTex( sumTex, sampler, (int2)(x + dx1.x, y + dy1.x), rows, cols, elemPerRow );
    t -= read_sumTex( sumTex, sampler, (int2)(x + dx1.x, y + dy2.x), rows, cols, elemPerRow );
    t -= read_sumTex( sumTex, sampler, (int2)(x + dx2.x, y + dy1.x), rows, cols, elemPerRow );
    t += read_sumTex( sumTex, sampler, (int2)(x + dx2.x, y + dy2.x), rows, cols, elemPerRow );
    d += t * src[4].x / ((dx2.x - dx1.x) * (dy2.x - dy1.x));
    t = 0;
    t += read_sumTex( sumTex, sampler, (int2)(x + dx1.y, y + dy1.y), rows, cols, elemPerRow );
    t -= read_sumTex( sumTex, sampler, (int2)(x + dx1.y, y + dy2.y), rows, cols, elemPerRow );
    t -= read_sumTex( sumTex, sampler, (int2)(x + dx2.y, y + dy1.y), rows, cols, elemPerRow );
    t += read_sumTex( sumTex, sampler, (int2)(x + dx2.y, y + dy2.y), rows, cols, elemPerRow );
    d += t * src[4].y / ((dx2.y - dx1.y) * (dy2.y - dy1.y));
    t = 0;
    t += read_sumTex( sumTex, sampler, (int2)(x + dx1.z, y + dy1.z), rows, cols, elemPerRow );
    t -= read_sumTex( sumTex, sampler, (int2)(x + dx1.z, y + dy2.z), rows, cols, elemPerRow );
    t -= read_sumTex( sumTex, sampler, (int2)(x + dx2.z, y + dy1.z), rows, cols, elemPerRow );
    t += read_sumTex( sumTex, sampler, (int2)(x + dx2.z, y + dy2.z), rows, cols, elemPerRow );
    d += t * src[4].z / ((dx2.z - dx1.z) * (dy2.z - dy1.z));
    return (float)d;
 }
 // N = 4
 float icvCalcHaarPatternSum_4(
    IMAGE_INT32 sumTex,
    __constant float4 *src,
    int oldSize,
    int newSize,
    int y, int x,
    int rows, int cols, int elemPerRow)
 {
    float ratio = (float)newSize / oldSize;
    F d = 0;
    int4 dx1 = convert_int4_rte(ratio * src[0]);
    int4 dy1 = convert_int4_rte(ratio * src[1]);
    int4 dx2 = convert_int4_rte(ratio * src[2]);
    int4 dy2 = convert_int4_rte(ratio * src[3]);
    F t = 0;
    t += read_sumTex( sumTex, sampler, (int2)(x + dx1.x, y + dy1.x), rows, cols, elemPerRow );
    t -= read_sumTex( sumTex, sampler, (int2)(x + dx1.x, y + dy2.x), rows, cols, elemPerRow );
    t -= read_sumTex( sumTex, sampler, (int2)(x + dx2.x, y + dy1.x), rows, cols, elemPerRow );
    t += read_sumTex( sumTex, sampler, (int2)(x + dx2.x, y + dy2.x), rows, cols, elemPerRow );
    d += t * src[4].x / ((dx2.x - dx1.x) * (dy2.x - dy1.x));
    t = 0;
    t += read_sumTex( sumTex, sampler, (int2)(x + dx1.y, y + dy1.y), rows, cols, elemPerRow );
    t -= read_sumTex( sumTex, sampler, (int2)(x + dx1.y, y + dy2.y), rows, cols, elemPerRow );
    t -= read_sumTex( sumTex, sampler, (int2)(x + dx2.y, y + dy1.y), rows, cols, elemPerRow );
    t += read_sumTex( sumTex, sampler, (int2)(x + dx2.y, y + dy2.y), rows, cols, elemPerRow );
    d += t * src[4].y / ((dx2.y - dx1.y) * (dy2.y - dy1.y));
    t = 0;
    t += read_sumTex( sumTex, sampler, (int2)(x + dx1.z, y + dy1.z), rows, cols, elemPerRow );
    t -= read_sumTex( sumTex, sampler, (int2)(x + dx1.z, y + dy2.z), rows, cols, elemPerRow );
    t -= read_sumTex( sumTex, sampler, (int2)(x + dx2.z, y + dy1.z), rows, cols, elemPerRow );
    t += read_sumTex( sumTex, sampler, (int2)(x + dx2.z, y + dy2.z), rows, cols, elemPerRow );
    d += t * src[4].z / ((dx2.z - dx1.z) * (dy2.z - dy1.z));
    t = 0;
    t += read_sumTex( sumTex, sampler, (int2)(x + dx1.w, y + dy1.w), rows, cols, elemPerRow );
    t -= read_sumTex( sumTex, sampler, (int2)(x + dx1.w, y + dy2.w), rows, cols, elemPerRow );
    t -= read_sumTex( sumTex, sampler, (int2)(x + dx2.w, y + dy1.w), rows, cols, elemPerRow );
    t += read_sumTex( sumTex, sampler, (int2)(x + dx2.w, y + dy2.w), rows, cols, elemPerRow );
    d += t * src[4].w / ((dx2.w - dx1.w) * (dy2.w - dy1.w));
    return (float)d;
 }
 ////////////////////////////////////////////////////////////////////////
 // Hessian
 __constant float4 c_DX[5] = { (float4)(0, 3, 6, 0), (float4)(2, 2, 2, 0), (float4)(3, 6, 9, 0), (float4)(7, 7, 7, 0), (float4)(1, -2, 1, 0) };
 __constant float4 c_DY[5] = { (float4)(2, 2, 2, 0), (float4)(0, 3, 6, 0), (float4)(7, 7, 7, 0), (float4)(3, 6, 9, 0), (float4)(1, -2, 1, 0) };
 __constant float4 c_DXY[5] = { (float4)(1, 5, 1, 5), (float4)(1, 1, 5, 5), (float4)(4, 8, 4, 8), (float4)(4, 4, 8, 8), (float4)(1, -1, -1, 1) };// Use integral image to calculate haar wavelets.
 __inline int calcSize(int octave, int layer)
 {
    /* Wavelet size at first layer of first octave. */
@ -250,6 +155,24 @@ __inline int calcSize(int octave, int layer)
    return (HAAR_SIZE0 + HAAR_SIZE_INC * layer) << octave;
 }
 // Calculate a derivative in an axis-aligned direction (x or y).  The "plus1"
 // boxes contribute 1 * (area), and the "minus2" box contributes -2 * (area).
 // So the final computation is plus1a + plus1b - 2 * minus2.  The corners are
 // labeled A, B, C, and D, with A being the top left, B being top right, C
 // being bottom left, and D being bottom right.
 F calcAxisAlignedDerivative(
        int plus1a_A, int plus1a_B, int plus1a_C, int plus1a_D, F plus1a_scale,
        int plus1b_A, int plus1b_B, int plus1b_C, int plus1b_D, F plus1b_scale,
        int minus2_A, int minus2_B, int minus2_C, int minus2_D, F minus2_scale)
 {
    F plus1a = plus1a_A - plus1a_B - plus1a_C + plus1a_D;
    F plus1b = plus1b_A - plus1b_B - plus1b_C + plus1b_D;
    F minus2 = minus2_A - minus2_B - minus2_C + minus2_D;
    return (plus1a / plus1a_scale -
            2.0f * minus2 / minus2_scale +
            plus1b / plus1b_scale);
 }
 //calculate targeted layer per-pixel determinant and trace with an integral image
 __kernel void icvCalcLayerDetAndTrace(
@ -264,7 +187,7 @@ __kernel void icvCalcLayerDetAndTrace(
    int c_octave,
    int c_layer_rows,
    int sumTex_step
-)
+    )
 {
    det_step   /= sizeof(*det);
    trace_step /= sizeof(*trace);
@ -288,16 +211,103 @@ __kernel void icvCalcLayerDetAndTrace(
    if (size <= c_img_rows && size <= c_img_cols && i < samples_i && j < samples_j)
    {
-        const float dx  = icvCalcHaarPatternSum_3(sumTex, c_DX , 9, size, i << c_octave, j << c_octave, c_img_rows, c_img_cols, sumTex_step);
+        int x = j << c_octave;
-        const float dy  = icvCalcHaarPatternSum_3(sumTex, c_DY , 9, size, i << c_octave, j << c_octave, c_img_rows, c_img_cols, sumTex_step);
+        int y = i << c_octave;
-        const float dxy = icvCalcHaarPatternSum_4(sumTex, c_DXY, 9, size, i << c_octave, j << c_octave, c_img_rows, c_img_cols, sumTex_step);
+
        float ratio = (float)size / 9;
        // Precompute some commonly used values, which are used to offset
        // texture coordinates in the integral image.
        int r1 = round(ratio);
        int r2 = round(ratio * 2.0f);
        int r3 = round(ratio * 3.0f);
        int r4 = round(ratio * 4.0f);
        int r5 = round(ratio * 5.0f);
        int r6 = round(ratio * 6.0f);
        int r7 = round(ratio * 7.0f);
        int r8 = round(ratio * 8.0f);
        int r9 = round(ratio * 9.0f);
        // Calculate the approximated derivative in the x-direction
        F d = 0;
        {
            // Some of the pixels needed to compute the derivative are
            // repeated, so we only don't duplicate the fetch here.
            int t02 = read_sumTex( sumTex, sampler, (int2)(x, y + r2), c_img_rows, c_img_cols, sumTex_step );
            int t07 = read_sumTex( sumTex, sampler, (int2)(x, y + r7), c_img_rows, c_img_cols, sumTex_step );
            int t32 = read_sumTex( sumTex, sampler, (int2)(x + r3, y + r2), c_img_rows, c_img_cols, sumTex_step );
            int t37 = read_sumTex( sumTex, sampler, (int2)(x + r3, y + r7), c_img_rows, c_img_cols, sumTex_step );
            int t62 = read_sumTex( sumTex, sampler, (int2)(x + r6, y + r2), c_img_rows, c_img_cols, sumTex_step );
            int t67 = read_sumTex( sumTex, sampler, (int2)(x + r6, y + r7), c_img_rows, c_img_cols, sumTex_step );
            int t92 = read_sumTex( sumTex, sampler, (int2)(x + r9, y + r2), c_img_rows, c_img_cols, sumTex_step );
            int t97 = read_sumTex( sumTex, sampler, (int2)(x + r9, y + r7), c_img_rows, c_img_cols, sumTex_step );
            d = calcAxisAlignedDerivative(t02, t07, t32, t37, (r3) * (r7 - r2),
                                          t62, t67, t92, t97, (r9 - r6) * (r7 - r2),
                                          t32, t37, t62, t67, (r6 - r3) * (r7 - r2));
        }
        const float dx  = (float)d;
        // Calculate the approximated derivative in the y-direction
        d = 0;
        {
            // Some of the pixels needed to compute the derivative are
            // repeated, so we only don't duplicate the fetch here.
            int t20 = read_sumTex( sumTex, sampler, (int2)(x + r2, y), c_img_rows, c_img_cols, sumTex_step );
            int t23 = read_sumTex( sumTex, sampler, (int2)(x + r2, y + r3), c_img_rows, c_img_cols, sumTex_step );
            int t70 = read_sumTex( sumTex, sampler, (int2)(x + r7, y), c_img_rows, c_img_cols, sumTex_step );
            int t73 = read_sumTex( sumTex, sampler, (int2)(x + r7, y + r3), c_img_rows, c_img_cols, sumTex_step );
            int t26 = read_sumTex( sumTex, sampler, (int2)(x + r2, y + r6), c_img_rows, c_img_cols, sumTex_step );
            int t76 = read_sumTex( sumTex, sampler, (int2)(x + r7, y + r6), c_img_rows, c_img_cols, sumTex_step );
            int t29 = read_sumTex( sumTex, sampler, (int2)(x + r2, y + r9), c_img_rows, c_img_cols, sumTex_step );
            int t79 = read_sumTex( sumTex, sampler, (int2)(x + r7, y + r9), c_img_rows, c_img_cols, sumTex_step );
            d = calcAxisAlignedDerivative(t20, t23, t70, t73, (r7 - r2) * (r3),
                                          t26, t29, t76, t79, (r7 - r2) * (r9 - r6),
                                          t23, t26, t73, t76, (r7 - r2) * (r6 - r3));
        }
        const float dy  = (float)d;
        // Calculate the approximated derivative in the xy-direction
        d = 0;
        {
            // There's no saving us here, we just have to get all of the pixels in
            // separate fetches
            F t = 0;
            t += read_sumTex( sumTex, sampler, (int2)(x + r1, y + r1), c_img_rows, c_img_cols, sumTex_step );
            t -= read_sumTex( sumTex, sampler, (int2)(x + r1, y + r4), c_img_rows, c_img_cols, sumTex_step );
            t -= read_sumTex( sumTex, sampler, (int2)(x + r4, y + r1), c_img_rows, c_img_cols, sumTex_step );
            t += read_sumTex( sumTex, sampler, (int2)(x + r4, y + r4), c_img_rows, c_img_cols, sumTex_step );
            d += t / ((r4 - r1) * (r4 - r1));
            t = 0;
            t += read_sumTex( sumTex, sampler, (int2)(x + r5, y + r1), c_img_rows, c_img_cols, sumTex_step );
            t -= read_sumTex( sumTex, sampler, (int2)(x + r5, y + r4), c_img_rows, c_img_cols, sumTex_step );
            t -= read_sumTex( sumTex, sampler, (int2)(x + r8, y + r1), c_img_rows, c_img_cols, sumTex_step );
            t += read_sumTex( sumTex, sampler, (int2)(x + r8, y + r4), c_img_rows, c_img_cols, sumTex_step );
            d -= t / ((r8 - r5) * (r4 - r1));
            t = 0;
            t += read_sumTex( sumTex, sampler, (int2)(x + r1, y + r5), c_img_rows, c_img_cols, sumTex_step );
            t -= read_sumTex( sumTex, sampler, (int2)(x + r1, y + r8), c_img_rows, c_img_cols, sumTex_step );
            t -= read_sumTex( sumTex, sampler, (int2)(x + r4, y + r5), c_img_rows, c_img_cols, sumTex_step );
            t += read_sumTex( sumTex, sampler, (int2)(x + r4, y + r8), c_img_rows, c_img_cols, sumTex_step );
            d -= t / ((r4 - r1) * (r8 - r5));
            t = 0;
            t += read_sumTex( sumTex, sampler, (int2)(x + r5, y + r5), c_img_rows, c_img_cols, sumTex_step );
            t -= read_sumTex( sumTex, sampler, (int2)(x + r5, y + r8), c_img_rows, c_img_cols, sumTex_step );
            t -= read_sumTex( sumTex, sampler, (int2)(x + r8, y + r5), c_img_rows, c_img_cols, sumTex_step );
            t += read_sumTex( sumTex, sampler, (int2)(x + r8, y + r8), c_img_rows, c_img_cols, sumTex_step );
            d += t / ((r8 - r5) * (r8 - r5));
        }
        const float dxy = (float)d;
        det  [j + margin + det_step   * (layer * c_layer_rows + i + margin)] = dx * dy - 0.81f * dxy * dxy;
        trace[j + margin + trace_step * (layer * c_layer_rows + i + margin)] = dx + dy;
    }
 }
 ////////////////////////////////////////////////////////////////////////
 // NONMAX
@ -309,10 +319,10 @@ bool within_check(IMAGE_INT32 maskSumTex, int sum_i, int sum_j, int size, int ro
    float d = 0;
-    int dx1 = convert_int_rte(ratio * c_DM[0]);
+    int dx1 = round(ratio * c_DM[0]);
-    int dy1 = convert_int_rte(ratio * c_DM[1]);
+    int dy1 = round(ratio * c_DM[1]);
-    int dx2 = convert_int_rte(ratio * c_DM[2]);
+    int dx2 = round(ratio * c_DM[2]);
-    int dy2 = convert_int_rte(ratio * c_DM[3]);
+    int dy2 = round(ratio * c_DM[3]);
    float t = 0;
@ -572,7 +582,7 @@ void icvFindMaximaInLayer(
 }
 // solve 3x3 linear system Ax=b for floating point input
-inline bool solve3x3_float(volatile __local  const float4 *A, volatile __local  const float *b, volatile __local  float *x)
+inline bool solve3x3_float(const float4 *A, const float *b, float *x)
 {
    float det = A[0].x * (A[1].y * A[2].z - A[1].z * A[2].y)
                - A[0].y * (A[1].x * A[2].z - A[1].z * A[2].x)
@ -651,7 +661,7 @@ void icvInterpolateKeypoint(
    if (get_local_id(0) == 0 && get_local_id(1) == 0 && get_local_id(2) == 0)
    {
-        volatile __local  float dD[3];
+        float dD[3];
        //dx
        dD[0] = -0.5f * (N9[1][1][2] - N9[1][1][0]);
@ -660,7 +670,7 @@ void icvInterpolateKeypoint(
        //ds
        dD[2] = -0.5f * (N9[2][1][1] - N9[0][1][1]);
-        volatile __local  float4 H[3];
+        float4 H[3];
        //dxx
        H[0].x = N9[1][1][0] - 2.0f * N9[1][1][1] + N9[1][1][2];
@ -681,7 +691,7 @@ void icvInterpolateKeypoint(
        //dss
        H[2].z = N9[0][1][1] - 2.0f * N9[1][1][1] + N9[2][1][1];
-        volatile __local  float x[3];
+        float x[3];
        if (solve3x3_float(H, dD, x))
        {
@ -711,7 +721,7 @@ void icvInterpolateKeypoint(
                sampled in a circle of radius 6s using wavelets of size 4s.
                We ensure the gradient wavelet size is even to ensure the
                wavelet pattern is balanced and symmetric around its center */
-                const int grad_wav_size = 2 * convert_int_rte(2.0f * s);
+                const int grad_wav_size = 2 * round(2.0f * s);
                // check when grad_wav_size is too big
                if ((c_img_rows + 1) >= grad_wav_size && (c_img_cols + 1) >= grad_wav_size)
@ -737,9 +747,12 @@ void icvInterpolateKeypoint(
 ////////////////////////////////////////////////////////////////////////
 // Orientation
-#define ORI_SEARCH_INC 5
+#define ORI_WIN			 60
-#define ORI_WIN        60
+#define ORI_SAMPLES		 113
-#define ORI_SAMPLES    113
+
 // The distance between samples in the beginning of the the reduction
 #define ORI_RESPONSE_REDUCTION_WIDTH		 48
 #define ORI_RESPONSE_ARRAY_SIZE			     (ORI_RESPONSE_REDUCTION_WIDTH * 2)
 __constant float c_aptX[ORI_SAMPLES] = {-6, -5, -5, -5, -5, -5, -5, -5, -4, -4, -4, -4, -4, -4, -4, -4, -4, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 6};
 __constant float c_aptY[ORI_SAMPLES] = {0, -3, -2, -1, 0, 1, 2, 3, -4, -3, -2, -1, 0, 1, 2, 3, 4, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -4, -3, -2, -1, 0, 1, 2, 3, 4, -3, -2, -1, 0, 1, 2, 3, 0};
@ -833,12 +846,15 @@ void icvCalcOrientation(
    __global float* featureDir  = keypoints + ANGLE_ROW * keypoints_step;
-    volatile __local  float s_X[128];
+    __local  float s_X[ORI_SAMPLES];
-    volatile __local  float s_Y[128];
+    __local  float s_Y[ORI_SAMPLES];
-    volatile __local  float s_angle[128];
+    __local  float s_angle[ORI_SAMPLES];
-    volatile __local  float s_sumx[32 * 4];
+    // Need to allocate enough to make the reduction work without accessing
-    volatile __local  float s_sumy[32 * 4];
+    // past the end of the array.
    __local  float s_sumx[ORI_RESPONSE_ARRAY_SIZE];
    __local  float s_sumy[ORI_RESPONSE_ARRAY_SIZE];
    __local  float s_mod[ORI_RESPONSE_ARRAY_SIZE];
    /* The sampling intervals and wavelet sized for selecting an orientation
    and building the keypoint descriptor are defined relative to 's' */
@ -849,28 +865,60 @@ void icvCalcOrientation(
    sampled in a circle of radius 6s using wavelets of size 4s.
    We ensure the gradient wavelet size is even to ensure the
    wavelet pattern is balanced and symmetric around its center */
-    const int grad_wav_size = 2 * convert_int_rte(2.0f * s);
+    const int grad_wav_size = 2 * round(2.0f * s);
    // check when grad_wav_size is too big
    if ((c_img_rows + 1) < grad_wav_size || (c_img_cols + 1) < grad_wav_size)
        return;
    // Calc X, Y, angle and store it to shared memory
-    const int tid = get_local_id(1) * get_local_size(0) + get_local_id(0);
+    const int tid = get_local_id(0);
    // Initialize values that are only used as part of the reduction later.
    if (tid < ORI_RESPONSE_ARRAY_SIZE - ORI_LOCAL_SIZE) {
        s_mod[tid + ORI_LOCAL_SIZE] = 0.0f;
    }
-    float X = 0.0f, Y = 0.0f, angle = 0.0f;
+    float ratio = (float)grad_wav_size / 4;
-    if (tid < ORI_SAMPLES)
+    int r2 = round(ratio * 2.0);
    int r4 = round(ratio * 4.0);
    for (int i = tid; i < ORI_SAMPLES; i += ORI_LOCAL_SIZE )
    {
        float X = 0.0f, Y = 0.0f, angle = 0.0f;
        const float margin = (float)(grad_wav_size - 1) / 2.0f;
-        const int x = convert_int_rte(featureX[get_group_id(0)] + c_aptX[tid] * s - margin);
+        const int x = round(featureX[get_group_id(0)] + c_aptX[i] * s - margin);
-        const int y = convert_int_rte(featureY[get_group_id(0)] + c_aptY[tid] * s - margin);
+        const int y = round(featureY[get_group_id(0)] + c_aptY[i] * s - margin);
        if (y >= 0 && y < (c_img_rows + 1) - grad_wav_size &&
-                x >= 0 && x < (c_img_cols + 1) - grad_wav_size)
+            x >= 0 && x < (c_img_cols + 1) - grad_wav_size)
        {
-            X = c_aptW[tid] * icvCalcHaarPatternSum_2(sumTex, c_NX, 4, grad_wav_size, y, x, c_img_rows, c_img_cols, sum_step);
+
-            Y = c_aptW[tid] * icvCalcHaarPatternSum_2(sumTex, c_NY, 4, grad_wav_size, y, x, c_img_rows, c_img_cols, sum_step);
+            float apt = c_aptW[i];
            // Compute the haar sum without fetching duplicate pixels.
            float t00 = read_sumTex( sumTex, sampler, (int2)(x, y), c_img_rows, c_img_cols, sum_step);
            float t02 = read_sumTex( sumTex, sampler, (int2)(x, y + r2), c_img_rows, c_img_cols, sum_step);
            float t04 = read_sumTex( sumTex, sampler, (int2)(x, y + r4), c_img_rows, c_img_cols, sum_step);
            float t20 = read_sumTex( sumTex, sampler, (int2)(x + r2, y), c_img_rows, c_img_cols, sum_step);
            float t24 = read_sumTex( sumTex, sampler, (int2)(x + r2, y + r4), c_img_rows, c_img_cols, sum_step);
            float t40 = read_sumTex( sumTex, sampler, (int2)(x + r4, y), c_img_rows, c_img_cols, sum_step);
            float t42 = read_sumTex( sumTex, sampler, (int2)(x + r4, y + r2), c_img_rows, c_img_cols, sum_step);
            float t44 = read_sumTex( sumTex, sampler, (int2)(x + r4, y + r4), c_img_rows, c_img_cols, sum_step);
            F t = t00 - t04 - t20 + t24;
            X -= t / ((r2) * (r4));
            t = t20 - t24 - t40 + t44;
            X += t / ((r4 - r2) * (r4));
            t = t00 - t02 - t40 + t42;
            Y += t / ((r2) * (r4));
            t = t02 - t04 - t42 + t44;
            Y -= t  / ((r4) * (r4 - r2));
            X = apt*X;
            Y = apt*Y;
            angle = atan2(Y, X);
@ -879,76 +927,61 @@ void icvCalcOrientation(
            angle *= 180.0f / CV_PI_F;
        }
        s_X[i] = X;
        s_Y[i] = Y;
        s_angle[i] = angle;
    }
    s_X[tid] = X;
    s_Y[tid] = Y;
    s_angle[tid] = angle;
    barrier(CLK_LOCAL_MEM_FENCE);
    float bestx = 0, besty = 0, best_mod = 0;
    float sumx = 0.0f, sumy = 0.0f;
    const int dir = tid * ORI_SEARCH_INC;
    #pragma unroll
    for (int i = 0; i < ORI_SAMPLES; ++i) {
        int angle = round(s_angle[i]);
-#pragma unroll
+        int d = abs(angle - dir);
-    for (int i = 0; i < 18; ++i)
+        if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)
-    {
+        {
-        const int dir = (i * 4 + get_local_id(1)) * ORI_SEARCH_INC;
+            sumx += s_X[i];
            sumy += s_Y[i];
        }
    }
    s_sumx[tid] = sumx;
    s_sumy[tid] = sumy;
    s_mod[tid] = sumx*sumx + sumy*sumy;
    barrier(CLK_LOCAL_MEM_FENCE);
-        volatile float sumx = 0.0f, sumy = 0.0f;
+    // This reduction searches for the longest wavelet response vector.  The first
-        int d = abs(convert_int_rte(s_angle[get_local_id(0)]) - dir);
+    // step uses all of the work items in the workgroup to narrow the search
-        if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)
+    // down to the three candidates.  It requires s_mod to have a few more
-        {
+    // elements alocated past the work-group size, which are pre-initialized to
-            sumx = s_X[get_local_id(0)];
+    // 0.0f above.
-            sumy = s_Y[get_local_id(0)];
+    for(int t = ORI_RESPONSE_REDUCTION_WIDTH; t >= 3; t /= 2) {
-        }
+        if (tid < t) {
-        d = abs(convert_int_rte(s_angle[get_local_id(0) + 32]) - dir);
+            if (s_mod[tid] < s_mod[tid + t]) {
-        if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)
+                s_mod[tid] = s_mod[tid + t];
-        {
+                s_sumx[tid] = s_sumx[tid + t];
-            sumx += s_X[get_local_id(0) + 32];
+                s_sumy[tid] = s_sumy[tid + t];
-            sumy += s_Y[get_local_id(0) + 32];
+            }
        }
        d = abs(convert_int_rte(s_angle[get_local_id(0) + 64]) - dir);
        if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)
        {
            sumx += s_X[get_local_id(0) + 64];
            sumy += s_Y[get_local_id(0) + 64];
        }
        d = abs(convert_int_rte(s_angle[get_local_id(0) + 96]) - dir);
        if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)
        {
            sumx += s_X[get_local_id(0) + 96];
            sumy += s_Y[get_local_id(0) + 96];
        }
        reduce_32_sum(s_sumx + get_local_id(1) * 32, &sumx, get_local_id(0));
        reduce_32_sum(s_sumy + get_local_id(1) * 32, &sumy, get_local_id(0));
        const float temp_mod = sumx * sumx + sumy * sumy;
        if (temp_mod > best_mod)
        {
            best_mod = temp_mod;
            bestx = sumx;
            besty = sumy;
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if (get_local_id(0) == 0)
    {
        s_X[get_local_id(1)] = bestx;
        s_Y[get_local_id(1)] = besty;
        s_angle[get_local_id(1)] = best_mod;
    }
    barrier(CLK_LOCAL_MEM_FENCE);
-    if (get_local_id(1) == 0 && get_local_id(0) == 0)
+    // Do the final reduction and write out the result.
    if (tid == 0)
    {
        int bestIdx = 0;
-        if (s_angle[1] > s_angle[bestIdx])
+        // The loop above narrowed the search of the longest vector to three
        // possibilities.  Pick the best here.
        if (s_mod[1] > s_mod[bestIdx])
            bestIdx = 1;
-        if (s_angle[2] > s_angle[bestIdx])
+        if (s_mod[2] > s_mod[bestIdx])
            bestIdx = 2;
        if (s_angle[3] > s_angle[bestIdx])
            bestIdx = 3;
-        float kp_dir = atan2(s_Y[bestIdx], s_X[bestIdx]);
+        float kp_dir = atan2(s_sumy[bestIdx], s_sumx[bestIdx]);
        if (kp_dir < 0)
            kp_dir += 2.0f * CV_PI_F;
        kp_dir *= 180.0f / CV_PI_F;
@ -961,7 +994,6 @@ void icvCalcOrientation(
    }
 }
 __kernel
 void icvSetUpright(
    __global float * keypoints,
@ -1035,8 +1067,8 @@ inline float linearFilter(
    float out = 0.0f;
-    const int x1 = convert_int_rtn(x);
+    const int x1 = round(x);
-    const int y1 = convert_int_rtn(y);
+    const int y1 = round(y);
    const int x2 = x1 + 1;
    const int y2 = y1 + 1;
--- a/modules/nonfree/src/surf.ocl.cpp
+++ b/modules/nonfree/src/surf.ocl.cpp
@ -46,6 +46,7 @@
 #ifdef HAVE_OPENCV_OCL
 #include <cstdio>
 #include <sstream>
 #include "opencl_kernels.hpp"
 using namespace cv;
@ -57,18 +58,25 @@ namespace cv
 {
    namespace ocl
    {
        // The number of degrees between orientation samples in calcOrientation
        const static int ORI_SEARCH_INC = 5;
        // The local size of the calcOrientation kernel
        const static int ORI_LOCAL_SIZE = (360 / ORI_SEARCH_INC);
        static void openCLExecuteKernelSURF(Context *clCxt, const cv::ocl::ProgramEntry* source, String kernelName, size_t globalThreads[3],
            size_t localThreads[3],  std::vector< std::pair<size_t, const void *> > &args, int channels, int depth)
        {
-            char optBuf [100] = {0};
+            std::stringstream optsStr;
-            char * optBufPtr = optBuf;
+            optsStr << "-D ORI_LOCAL_SIZE=" << ORI_LOCAL_SIZE << " ";
            optsStr << "-D ORI_SEARCH_INC=" << ORI_SEARCH_INC << " ";
            cl_kernel kernel;
-            kernel = openCLGetKernelFromSource(clCxt, source, kernelName, optBufPtr);
+            kernel = openCLGetKernelFromSource(clCxt, source, kernelName, optsStr.str().c_str());
            size_t wave_size = queryWaveFrontSize(kernel);
            CV_Assert(clReleaseKernel(kernel) == CL_SUCCESS);
-            sprintf(optBufPtr, "-D WAVE_SIZE=%d", static_cast<int>(wave_size));
+            optsStr << "-D WAVE_SIZE=" << wave_size;
-            openCLExecuteKernel(clCxt, source, kernelName, globalThreads, localThreads, args, channels, depth, optBufPtr);
+            openCLExecuteKernel(clCxt, source, kernelName, globalThreads, localThreads, args, channels, depth, optsStr.str().c_str());
        }
    }
 }
@ -601,8 +609,8 @@ void SURF_OCL_Invoker::icvCalcOrientation_gpu(const oclMat &keypoints, int nFeat
    args.push_back( std::make_pair( sizeof(cl_int), (void *)&img_cols));
    args.push_back( std::make_pair( sizeof(cl_int), (void *)&surf_.sum.step));
-    size_t localThreads[3]  = {32, 4, 1};
+    size_t localThreads[3]  = {ORI_LOCAL_SIZE, 1, 1};
-    size_t globalThreads[3] = {nFeatures *localThreads[0], localThreads[1], 1};
+    size_t globalThreads[3] = {nFeatures * localThreads[0], 1, 1};
    openCLExecuteKernelSURF(clCxt, &surfprog, kernelName, globalThreads, localThreads, args, -1, -1);
 }
--- a/modules/ocl/doc/image_filtering.rst
+++ b/modules/ocl/doc/image_filtering.rst
@ -287,7 +287,7 @@ ocl::createSeparableLinearFilter_GPU
 ----------------------------------------
 Creates a separable linear filter engine.
-.. ocv:function:: Ptr<FilterEngine_GPU> ocl::createSeparableLinearFilter_GPU(int srcType, int dstType, const Mat &rowKernel, const Mat &columnKernel, const Point &anchor = Point(-1, -1), double delta = 0.0, int bordertype = BORDER_DEFAULT)
+.. ocv:function:: Ptr<FilterEngine_GPU> ocl::createSeparableLinearFilter_GPU(int srcType, int dstType, const Mat &rowKernel, const Mat &columnKernel, const Point &anchor = Point(-1, -1), double delta = 0.0, int bordertype = BORDER_DEFAULT, Size imgSize = Size(-1,-1) )
    :param srcType: Source array type.  ``CV_8UC1`` , ``CV_8UC4`` , ``CV_16SC1`` , ``CV_16SC2`` , ``CV_16SC3`` , ``CV_32SC1`` , ``CV_32FC1``  source types are supported.
@ -303,6 +303,8 @@ Creates a separable linear filter engine.
    :param bordertype: Pixel extrapolation method.
    :param imgSize: Source image size to choose optimal method for processing.
 .. seealso:: :ocv:func:`ocl::getLinearRowFilter_GPU`, :ocv:func:`ocl::getLinearColumnFilter_GPU`, :ocv:func:`createSeparableLinearFilter`
@ -334,7 +336,7 @@ ocl::createDerivFilter_GPU
 ------------------------------
 Creates a filter engine for the generalized Sobel operator.
-.. ocv:function:: Ptr<FilterEngine_GPU> ocl::createDerivFilter_GPU( int srcType, int dstType, int dx, int dy, int ksize, int borderType = BORDER_DEFAULT )
+.. ocv:function:: Ptr<FilterEngine_GPU> ocl::createDerivFilter_GPU( int srcType, int dstType, int dx, int dy, int ksize, int borderType = BORDER_DEFAULT, Size imgSize = Size(-1,-1) )
    :param srcType: Source image type.  ``CV_8UC1`` , ``CV_8UC4`` , ``CV_16SC1`` , ``CV_16SC2`` , ``CV_16SC3`` , ``CV_32SC1`` , ``CV_32FC1``  source types are supported.
@ -348,6 +350,8 @@ Creates a filter engine for the generalized Sobel operator.
    :param borderType: Pixel extrapolation method. For details, see  :ocv:func:`borderInterpolate`.
    :param imgSize: Source image size to choose optimal method for processing.
 .. seealso:: :ocv:func:`ocl::createSeparableLinearFilter_GPU`, :ocv:func:`createDerivFilter`
@ -405,7 +409,7 @@ ocl::createGaussianFilter_GPU
 ---------------------------------
 Creates a Gaussian filter engine.
-.. ocv:function:: Ptr<FilterEngine_GPU> ocl::createGaussianFilter_GPU(int type, Size ksize, double sigma1, double sigma2 = 0, int bordertype = BORDER_DEFAULT)
+.. ocv:function:: Ptr<FilterEngine_GPU> ocl::createGaussianFilter_GPU(int type, Size ksize, double sigma1, double sigma2 = 0, int bordertype = BORDER_DEFAULT, Size imgSize = Size(-1,-1) )
    :param type: Source and destination image type.  ``CV_8UC1`` , ``CV_8UC4`` , ``CV_16SC1`` , ``CV_16SC2`` , ``CV_16SC3`` , ``CV_32SC1`` , ``CV_32FC1`` are supported.
@ -417,6 +421,8 @@ Creates a Gaussian filter engine.
    :param bordertype: Pixel extrapolation method. For details, see  :ocv:func:`borderInterpolate`.
    :param imgSize: Source image size to choose optimal method for processing.
 .. seealso:: :ocv:func:`ocl::createSeparableLinearFilter_GPU`, :ocv:func:`createGaussianFilter`
 ocl::GaussianBlur
--- a/modules/ocl/include/opencv2/ocl.hpp
+++ b/modules/ocl/include/opencv2/ocl.hpp
@ -695,17 +695,17 @@ namespace cv
        //! returns the separable linear filter engine
        CV_EXPORTS Ptr<FilterEngine_GPU> createSeparableLinearFilter_GPU(int srcType, int dstType, const Mat &rowKernel,
-                const Mat &columnKernel, const Point &anchor = Point(-1, -1), double delta = 0.0, int bordertype = BORDER_DEFAULT);
+                const Mat &columnKernel, const Point &anchor = Point(-1, -1), double delta = 0.0, int bordertype = BORDER_DEFAULT, Size imgSize = Size(-1,-1));
        //! returns the separable filter engine with the specified filters
        CV_EXPORTS Ptr<FilterEngine_GPU> createSeparableFilter_GPU(const Ptr<BaseRowFilter_GPU> &rowFilter,
                const Ptr<BaseColumnFilter_GPU> &columnFilter);
        //! returns the Gaussian filter engine
-        CV_EXPORTS Ptr<FilterEngine_GPU> createGaussianFilter_GPU(int type, Size ksize, double sigma1, double sigma2 = 0, int bordertype = BORDER_DEFAULT);
+        CV_EXPORTS Ptr<FilterEngine_GPU> createGaussianFilter_GPU(int type, Size ksize, double sigma1, double sigma2 = 0, int bordertype = BORDER_DEFAULT, Size imgSize = Size(-1,-1));
        //! returns filter engine for the generalized Sobel operator
-        CV_EXPORTS Ptr<FilterEngine_GPU> createDerivFilter_GPU( int srcType, int dstType, int dx, int dy, int ksize, int borderType = BORDER_DEFAULT );
+        CV_EXPORTS Ptr<FilterEngine_GPU> createDerivFilter_GPU( int srcType, int dstType, int dx, int dy, int ksize, int borderType = BORDER_DEFAULT, Size imgSize = Size(-1,-1) );
        //! applies Laplacian operator to the image
        // supports only ksize = 1 and ksize = 3
@ -1439,8 +1439,10 @@ namespace cv
            oclMat Dx_;
            oclMat Dy_;
            oclMat eig_;
            oclMat eig_minmax_;
            oclMat minMaxbuf_;
            oclMat tmpCorners_;
            oclMat counter_;
        };
        inline GoodFeaturesToTrackDetector_OCL::GoodFeaturesToTrackDetector_OCL(int maxCorners_, double qualityLevel_, double minDistance_,
--- a/modules/ocl/src/color.cpp
+++ b/modules/ocl/src/color.cpp
@ -56,8 +56,19 @@ static void fromRGB_caller(const oclMat &src, oclMat &dst, int bidx, const std::
 {
    int src_offset = src.offset / src.elemSize1(), src_step = src.step1();
    int dst_offset = dst.offset / dst.elemSize1(), dst_step = dst.step1();
    int pixels_per_work_item = 1;
-    String build_options = format("-D DEPTH_%d", src.depth());
+    if (Context::getContext()->supportsFeature(FEATURE_CL_INTEL_DEVICE))
    {
        if ((src.cols % 4 == 0) && (src.depth() == CV_8U))
            pixels_per_work_item =  4;
        else if (src.cols % 2 == 0)
            pixels_per_work_item =  2;
        else
            pixels_per_work_item =  1;
    }
    String build_options = format("-D DEPTH_%d -D scn=%d -D bidx=%d -D pixels_per_work_item=%d", src.depth(), src.oclchannels(), bidx, pixels_per_work_item);
    if (!additionalOptions.empty())
        build_options = build_options + additionalOptions;
@ -66,7 +77,6 @@ static void fromRGB_caller(const oclMat &src, oclMat &dst, int bidx, const std::
    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.rows));
    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_step));
    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst_step));
    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&bidx));
    args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data));
    args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst.data));
    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_offset ));
@ -77,6 +87,73 @@ static void fromRGB_caller(const oclMat &src, oclMat &dst, int bidx, const std::
    if (!data2.empty())
        args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&data2.data ));
    size_t gt[3] = { dst.cols/pixels_per_work_item, dst.rows, 1 };
 #ifdef ANDROID
    size_t lt[3] = { 16, 10, 1 };
 #else
    size_t lt[3] = { 16, 16, 1 };
 #endif
    openCLExecuteKernel(src.clCxt, &cvt_color, kernelName.c_str(), gt, lt, args, -1, -1, build_options.c_str());
 }
 static void toHSV_caller(const oclMat &src, oclMat &dst, int bidx, const std::string & kernelName,
                           const std::string & additionalOptions = std::string(),
                           const oclMat & data1 = oclMat(), const oclMat & data2 = oclMat())
 {
    int src_offset = src.offset / src.elemSize1(), src_step = src.step1();
    int dst_offset = dst.offset / dst.elemSize1(), dst_step = dst.step1();
    std::string build_options = format("-D DEPTH_%d -D scn=%d -D bidx=%d", src.depth(), src.oclchannels(), bidx);
    if (!additionalOptions.empty())
        build_options += additionalOptions;
    std::vector<std::pair<size_t , const void *> > args;
    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.cols));
    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.rows));
    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_step));
    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst_step));
    args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data));
    args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst.data));
    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_offset ));
    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst_offset ));
    if (!data1.empty())
        args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&data1.data ));
    if (!data2.empty())
        args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&data2.data ));
   size_t gt[3] = { dst.cols, dst.rows, 1 };
 #ifdef ANDROID
    size_t lt[3] = { 16, 10, 1 };
 #else
    size_t lt[3] = { 16, 16, 1 };
 #endif
    openCLExecuteKernel(src.clCxt, &cvt_color, kernelName.c_str(), gt, lt, args, -1, -1, build_options.c_str());
 }
 static void fromGray_caller(const oclMat &src, oclMat &dst, int bidx, const std::string & kernelName,
                         const std::string & additionalOptions = std::string(), const oclMat & data = oclMat())
 {
    std::string build_options = format("-D DEPTH_%d -D dcn=%d -D bidx=%d", src.depth(), dst.channels(), bidx);
    if (!additionalOptions.empty())
        build_options += additionalOptions;
    int src_offset = src.offset / src.elemSize1(), src_step = src.step1();
    int dst_offset = dst.offset / dst.elemSize1(), dst_step = dst.step1();
    std::vector<std::pair<size_t , const void *> > args;
    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.cols));
    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.rows));
    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_step));
    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst_step));
    args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data));
    args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst.data));
    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_offset ));
    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst_offset ));
    if (!data.empty())
        args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&data.data ));
    size_t gt[3] = { dst.cols, dst.rows, 1 };
 #ifdef ANDROID
    size_t lt[3] = { 16, 10, 1 };
@ -89,7 +166,50 @@ static void fromRGB_caller(const oclMat &src, oclMat &dst, int bidx, const std::
 static void toRGB_caller(const oclMat &src, oclMat &dst, int bidx, const std::string & kernelName,
                         const std::string & additionalOptions = std::string(), const oclMat & data = oclMat())
 {
-    String build_options = format("-D DEPTH_%d -D dcn=%d", src.depth(), dst.channels());
+    int src_offset = src.offset / src.elemSize1(), src_step = src.step1();
    int dst_offset = dst.offset / dst.elemSize1(), dst_step = dst.step1();
    int pixels_per_work_item = 1;
    if (Context::getContext()->supportsFeature(FEATURE_CL_INTEL_DEVICE))
    {
        if ((src.cols % 4 == 0) && (src.depth() == CV_8U))
            pixels_per_work_item =  4;
        else if (src.cols % 2 == 0)
            pixels_per_work_item =  2;
        else
            pixels_per_work_item =  1;
    }
    std::string build_options = format("-D DEPTH_%d -D dcn=%d -D bidx=%d -D pixels_per_work_item=%d", src.depth(), dst.channels(), bidx, pixels_per_work_item);
    if (!additionalOptions.empty())
        build_options += additionalOptions;
    std::vector<std::pair<size_t , const void *> > args;
    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.cols));
    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.rows));
    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_step));
    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst_step));
    args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data));
    args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst.data));
    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_offset ));
    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst_offset ));
    if (!data.empty())
        args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&data.data ));
    size_t gt[3] = { dst.cols/pixels_per_work_item, dst.rows, 1 };
 #ifdef ANDROID
    size_t lt[3] = { 16, 10, 1 };
 #else
    size_t lt[3] = { 16, 16, 1 };
 #endif
    openCLExecuteKernel(src.clCxt, &cvt_color, kernelName.c_str(), gt, lt, args, -1, -1, build_options.c_str());
 }
 static void toRGB_NV12_caller(const oclMat &src, oclMat &dst, int bidx, const std::string & kernelName,
                         const std::string & additionalOptions = std::string(), const oclMat & data = oclMat())
 {
    String build_options = format("-D DEPTH_%d -D dcn=%d -D bidx=%d", src.depth(), dst.channels(), bidx);
    if (!additionalOptions.empty())
        build_options = build_options + additionalOptions;
@ -101,7 +221,6 @@ static void toRGB_caller(const oclMat &src, oclMat &dst, int bidx, const std::st
    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.rows));
    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_step));
    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst_step));
    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&bidx));
    args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data));
    args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst.data));
    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_offset ));
@ -119,10 +238,13 @@ static void toRGB_caller(const oclMat &src, oclMat &dst, int bidx, const std::st
    openCLExecuteKernel(src.clCxt, &cvt_color, kernelName.c_str(), gt, lt, args, -1, -1, build_options.c_str());
 }
-static void RGB_caller(const oclMat &src, oclMat &dst, bool reverse)
+static void fromHSV_caller(const oclMat &src, oclMat &dst, int bidx, const std::string & kernelName,
                         const std::string & additionalOptions = std::string(), const oclMat & data = oclMat())
 {
-    String build_options = format("-D DEPTH_%d -D dcn=%d -D scn=%d -D %s", src.depth(),
+    std::string build_options = format("-D DEPTH_%d -D dcn=%d -D bidx=%d", src.depth(), dst.channels(), bidx);
-                                  dst.channels(), src.channels(), reverse ? "REVERSE" : "ORDER");
+    if (!additionalOptions.empty())
        build_options += additionalOptions;
    int src_offset = src.offset / src.elemSize1(), src_step = src.step1();
    int dst_offset = dst.offset / dst.elemSize1(), dst_step = dst.step1();
@ -136,6 +258,36 @@ static void RGB_caller(const oclMat &src, oclMat &dst, bool reverse)
    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_offset ));
    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst_offset ));
    if (!data.empty())
        args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&data.data ));
    size_t gt[3] = { dst.cols, dst.rows, 1 };
 #ifdef ANDROID
    size_t lt[3] = { 16, 10, 1 };
 #else
    size_t lt[3] = { 16, 16, 1 };
 #endif
    openCLExecuteKernel(src.clCxt, &cvt_color, kernelName.c_str(), gt, lt, args, -1, -1, build_options.c_str());
 }
 static void RGB_caller(const oclMat &src, oclMat &dst, bool reverse)
 {
    int src_offset = src.offset / src.elemSize1(), src_step = src.step1();
    int dst_offset = dst.offset / dst.elemSize1(), dst_step = dst.step1();
    String build_options = format("-D DEPTH_%d -D dcn=%d -D scn=%d -D %s",
                                  src.depth(), dst.channels(), src.channels(), reverse ? "REVERSE" : "ORDER");
    std::vector<std::pair<size_t , const void *> > args;
    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.cols));
    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.rows));
    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_step));
    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst_step));
    args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data));
    args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst.data));
    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_offset ));
    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst_offset ));
    size_t gt[3] = { dst.cols, dst.rows, 1 };
 #ifdef ANDROID
    size_t lt[3] = { 16, 10, 1 };
@ -147,8 +299,8 @@ static void RGB_caller(const oclMat &src, oclMat &dst, bool reverse)
 static void fromRGB5x5_caller(const oclMat &src, oclMat &dst, int bidx, int greenbits, const std::string & kernelName)
 {
-    String build_options = format("-D DEPTH_%d -D greenbits=%d -D dcn=%d",
+    String build_options = format("-D DEPTH_%d -D greenbits=%d -D dcn=%d -D bidx=%d",
-                                  src.depth(), greenbits, dst.channels());
+                                  src.depth(), greenbits, dst.channels(), bidx);
    int src_offset = src.offset >> 1, src_step = src.step >> 1;
    int dst_offset = dst.offset / dst.elemSize1(), dst_step = dst.step / dst.elemSize1();
@ -157,7 +309,6 @@ static void fromRGB5x5_caller(const oclMat &src, oclMat &dst, int bidx, int gree
    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.rows));
    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_step));
    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst_step));
    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&bidx));
    args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data));
    args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst.data));
    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_offset ));
@ -174,8 +325,8 @@ static void fromRGB5x5_caller(const oclMat &src, oclMat &dst, int bidx, int gree
 static void toRGB5x5_caller(const oclMat &src, oclMat &dst, int bidx, int greenbits, const std::string & kernelName)
 {
-    String build_options = format("-D DEPTH_%d -D greenbits=%d -D scn=%d",
+    String build_options = format("-D DEPTH_%d -D greenbits=%d -D scn=%d -D bidx=%d",
-                                  src.depth(), greenbits, src.channels());
+                                  src.depth(), greenbits, src.channels(), bidx);
    int src_offset = (int)src.offset, src_step = (int)src.step;
    int dst_offset = dst.offset >> 1, dst_step = dst.step >> 1;
@ -184,7 +335,6 @@ static void toRGB5x5_caller(const oclMat &src, oclMat &dst, int bidx, int greenb
    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.rows));
    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_step));
    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst_step));
    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&bidx));
    args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data));
    args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst.data));
    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_offset ));
@ -272,7 +422,7 @@ static void cvtColor_caller(const oclMat &src, oclMat &dst, int code, int dcn)
        CV_Assert(scn == 1);
        dcn  = code == COLOR_GRAY2BGRA ? 4 : 3;
        dst.create(sz, CV_MAKETYPE(depth, dcn));
-        toRGB_caller(src, dst, 0, "Gray2RGB");
+        fromGray_caller(src, dst, 0, "Gray2RGB");
        break;
    }
    case COLOR_BGR2YUV: case COLOR_RGB2YUV:
@ -303,7 +453,7 @@ static void cvtColor_caller(const oclMat &src, oclMat &dst, int code, int dcn)
        Size dstSz(sz.width, sz.height * 2 / 3);
        dst.create(dstSz, CV_MAKETYPE(depth, dcn));
-        toRGB_caller(src, dst, bidx, "YUV2RGBA_NV12");
+        toRGB_NV12_caller(src, dst, bidx, "YUV2RGBA_NV12");
        break;
    }
    case COLOR_BGR2YCrCb: case COLOR_RGB2YCrCb:
@ -460,11 +610,11 @@ static void cvtColor_caller(const oclMat &src, oclMat &dst, int code, int dcn)
                initialized = true;
            }
-            fromRGB_caller(src, dst, bidx, kernelName, format(" -D hrange=%d", hrange), sdiv_data, hrange == 256 ? hdiv_data256 : hdiv_data180);
+            toHSV_caller(src, dst, bidx, kernelName, format(" -D hrange=%d", hrange), sdiv_data, hrange == 256 ? hdiv_data256 : hdiv_data180);
            return;
        }
-        fromRGB_caller(src, dst, bidx, kernelName, format(" -D hscale=%f", hrange*(1.f/360.f)));
+        toHSV_caller(src, dst, bidx, kernelName, format(" -D hscale=%f", hrange*(1.f/360.f)));
        break;
    }
    case COLOR_HSV2BGR: case COLOR_HSV2RGB: case COLOR_HSV2BGR_FULL: case COLOR_HSV2RGB_FULL:
@ -483,7 +633,7 @@ static void cvtColor_caller(const oclMat &src, oclMat &dst, int code, int dcn)
        dst.create(sz, CV_MAKETYPE(depth, dcn));
        std::string kernelName = std::string(is_hsv ? "HSV" : "HLS") + "2RGB";
-        toRGB_caller(src, dst, bidx, kernelName, format(" -D hrange=%d -D hscale=%f", hrange, 6.f/hrange));
+        fromHSV_caller(src, dst, bidx, kernelName, format(" -D hrange=%d -D hscale=%f", hrange, 6.f/hrange));
        break;
    }
    case COLOR_RGBA2mRGBA: case COLOR_mRGBA2RGBA:
--- a/modules/ocl/src/filtering.cpp
+++ b/modules/ocl/src/filtering.cpp
@ -741,6 +741,135 @@ void cv::ocl::filter2D(const oclMat &src, oclMat &dst, int ddepth, const Mat &ke
    f->apply(src, dst);
 }
 const int optimizedSepFilterLocalSize = 16;
 static void sepFilter2D_SinglePass(const oclMat &src, oclMat &dst,
                                   const Mat &row_kernel, const Mat &col_kernel, int bordertype = BORDER_DEFAULT)
 {
    size_t lt2[3] = {optimizedSepFilterLocalSize, optimizedSepFilterLocalSize, 1};
    size_t gt2[3] = {lt2[0]*(1 + (src.cols-1) / lt2[0]), lt2[1]*(1 + (src.rows-1) / lt2[1]), 1};
    unsigned int src_pitch = src.step;
    unsigned int dst_pitch = dst.step;
    int src_offset_x = (src.offset % src.step) / src.elemSize();
    int src_offset_y = src.offset / src.step;
    std::vector<std::pair<size_t , const void *> > args;
    args.push_back( std::make_pair( sizeof(cl_mem)  , (void *)&src.data ));
    args.push_back( std::make_pair( sizeof(cl_uint) , (void *)&src_pitch ));
    args.push_back( std::make_pair( sizeof(cl_int)  , (void *)&src_offset_x ));
    args.push_back( std::make_pair( sizeof(cl_int)  , (void *)&src_offset_y ));
    args.push_back( std::make_pair( sizeof(cl_mem)  , (void *)&dst.data ));
    args.push_back( std::make_pair( sizeof(cl_int)  , (void *)&dst.offset ));
    args.push_back( std::make_pair( sizeof(cl_uint) , (void *)&dst_pitch ));
    args.push_back( std::make_pair( sizeof(cl_int)  , (void *)&src.wholecols ));
    args.push_back( std::make_pair( sizeof(cl_int)  , (void *)&src.wholerows ));
    args.push_back( std::make_pair( sizeof(cl_int)  , (void *)&dst.cols ));
    args.push_back( std::make_pair( sizeof(cl_int)  , (void *)&dst.rows ));
    String option = cv::format("-D BLK_X=%d -D BLK_Y=%d -D RADIUSX=%d -D RADIUSY=%d",(int)lt2[0], (int)lt2[1],
        row_kernel.rows / 2, col_kernel.rows / 2 );
    option += " -D KERNEL_MATRIX_X=";
    for(int i=0; i<row_kernel.rows; i++)
        option += cv::format("0x%x,", *reinterpret_cast<const unsigned int*>( &row_kernel.at<float>(i) ) );
    option += "0x0";
    option += " -D KERNEL_MATRIX_Y=";
    for(int i=0; i<col_kernel.rows; i++)
        option += cv::format("0x%x,", *reinterpret_cast<const unsigned int*>( &col_kernel.at<float>(i) ) );
    option += "0x0";
    switch(src.type())
    {
    case CV_8UC1:
        option += " -D SRCTYPE=uchar -D CONVERT_SRCTYPE=convert_float -D WORKTYPE=float";
        break;
    case CV_32FC1:
        option += " -D SRCTYPE=float -D CONVERT_SRCTYPE= -D WORKTYPE=float";
        break;
    case CV_8UC2:
        option += " -D SRCTYPE=uchar2 -D CONVERT_SRCTYPE=convert_float2 -D WORKTYPE=float2";
        break;
    case CV_32FC2:
        option += " -D SRCTYPE=float2 -D CONVERT_SRCTYPE= -D WORKTYPE=float2";
        break;
    case CV_8UC3:
        option += " -D SRCTYPE=uchar3 -D CONVERT_SRCTYPE=convert_float3 -D WORKTYPE=float3";
        break;
    case CV_32FC3:
        option += " -D SRCTYPE=float3 -D CONVERT_SRCTYPE= -D WORKTYPE=float3";
        break;
    case CV_8UC4:
        option += " -D SRCTYPE=uchar4 -D CONVERT_SRCTYPE=convert_float4 -D WORKTYPE=float4";
        break;
    case CV_32FC4:
        option += " -D SRCTYPE=float4 -D CONVERT_SRCTYPE= -D WORKTYPE=float4";
        break;
    default:
        CV_Error(CV_StsUnsupportedFormat, "Image type is not supported!");
        break;
    }
    switch(dst.type())
    {
    case CV_8UC1:
        option += " -D DSTTYPE=uchar -D CONVERT_DSTTYPE=convert_uchar_sat";
        break;
    case CV_8UC2:
        option += " -D DSTTYPE=uchar2 -D CONVERT_DSTTYPE=convert_uchar2_sat";
        break;
    case CV_8UC3:
        option += " -D DSTTYPE=uchar3 -D CONVERT_DSTTYPE=convert_uchar3_sat";
        break;
    case CV_8UC4:
        option += " -D DSTTYPE=uchar4 -D CONVERT_DSTTYPE=convert_uchar4_sat";
        break;
    case CV_32FC1:
        option += " -D DSTTYPE=float -D CONVERT_DSTTYPE=";
        break;
    case CV_32FC2:
        option += " -D DSTTYPE=float2 -D CONVERT_DSTTYPE=";
        break;
    case CV_32FC3:
        option += " -D DSTTYPE=float3 -D CONVERT_DSTTYPE=";
        break;
    case CV_32FC4:
        option += " -D DSTTYPE=float4 -D CONVERT_DSTTYPE=";
        break;
    default:
        CV_Error(CV_StsUnsupportedFormat, "Image type is not supported!");
        break;
    }
    switch(bordertype)
    {
    case cv::BORDER_CONSTANT:
        option += " -D BORDER_CONSTANT";
        break;
    case cv::BORDER_REPLICATE:
        option += " -D BORDER_REPLICATE";
        break;
    case cv::BORDER_REFLECT:
        option += " -D BORDER_REFLECT";
        break;
    case cv::BORDER_REFLECT101:
        option += " -D BORDER_REFLECT_101";
        break;
    case cv::BORDER_WRAP:
        option += " -D BORDER_WRAP";
        break;
    default:
        CV_Error(CV_StsBadFlag, "BORDER type is not supported!");
        break;
    }
    openCLExecuteKernel(src.clCxt, &filtering_sep_filter_singlepass, "sep_filter_singlepass", gt2, lt2, args,
        -1, -1, option.c_str() );
 }
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // SeparableFilter
@ -790,6 +919,35 @@ Ptr<FilterEngine_GPU> cv::ocl::createSeparableFilter_GPU(const Ptr<BaseRowFilter
    return makePtr<SeparableFilterEngine_GPU>(rowFilter, columnFilter);
 }
 namespace
 {
 class SingleStepSeparableFilterEngine_GPU : public FilterEngine_GPU
 {
 public:
    SingleStepSeparableFilterEngine_GPU( const Mat &rowKernel_, const Mat &columnKernel_, const int btype )
    {
        bordertype = btype;
        rowKernel = rowKernel_;
        columnKernel = columnKernel_;
    }
    virtual void apply(const oclMat &src, oclMat &dst, Rect roi = Rect(0, 0, -1, -1))
    {
        normalizeROI(roi, Size(rowKernel.rows, columnKernel.rows), Point(-1,-1), src.size());
        oclMat srcROI = src(roi);
        oclMat dstROI = dst(roi);
        sepFilter2D_SinglePass(src, dst, rowKernel, columnKernel, bordertype);
    }
    Mat rowKernel;
    Mat columnKernel;
    int bordertype;
 };
 }
 static void GPUFilterBox(const oclMat &src, oclMat &dst,
                         Size &ksize, const Point anchor, const int borderType)
 {
@ -1243,17 +1401,32 @@ Ptr<BaseColumnFilter_GPU> cv::ocl::getLinearColumnFilter_GPU(int /*bufType*/, in
 }
 Ptr<FilterEngine_GPU> cv::ocl::createSeparableLinearFilter_GPU(int srcType, int dstType,
-        const Mat &rowKernel, const Mat &columnKernel, const Point &anchor, double delta, int bordertype)
+        const Mat &rowKernel, const Mat &columnKernel, const Point &anchor, double delta, int bordertype, Size imgSize )
 {
    int sdepth = CV_MAT_DEPTH(srcType), ddepth = CV_MAT_DEPTH(dstType);
    int cn = CV_MAT_CN(srcType);
    int bdepth = std::max(std::max(sdepth, ddepth), CV_32F);
    int bufType = CV_MAKETYPE(bdepth, cn);
    Context* clCxt = Context::getContext();
-    Ptr<BaseRowFilter_GPU> rowFilter = getLinearRowFilter_GPU(srcType, bufType, rowKernel, anchor.x, bordertype);
+    //if image size is non-degenerate and large enough
-    Ptr<BaseColumnFilter_GPU> columnFilter = getLinearColumnFilter_GPU(bufType, dstType, columnKernel, anchor.y, bordertype, delta);
+    //and if filter support is reasonable to satisfy larger local memory requirements,
    //then we can use single pass routine to avoid extra runtime calls overhead
    if( clCxt && clCxt->supportsFeature(FEATURE_CL_INTEL_DEVICE) &&
        rowKernel.rows <= 21 && columnKernel.rows <= 21 &&
        (rowKernel.rows & 1) == 1 && (columnKernel.rows & 1) == 1 &&
        imgSize.width > optimizedSepFilterLocalSize + (rowKernel.rows>>1) &&
        imgSize.height > optimizedSepFilterLocalSize + (columnKernel.rows>>1) )
    {
        return Ptr<FilterEngine_GPU>(new SingleStepSeparableFilterEngine_GPU(rowKernel, columnKernel, bordertype));
    }
    else
    {
        Ptr<BaseRowFilter_GPU> rowFilter = getLinearRowFilter_GPU(srcType, bufType, rowKernel, anchor.x, bordertype);
        Ptr<BaseColumnFilter_GPU> columnFilter = getLinearColumnFilter_GPU(bufType, dstType, columnKernel, anchor.y, bordertype, delta);
-    return createSeparableFilter_GPU(rowFilter, columnFilter);
+        return createSeparableFilter_GPU(rowFilter, columnFilter);
    }
 }
 void cv::ocl::sepFilter2D(const oclMat &src, oclMat &dst, int ddepth, const Mat &kernelX, const Mat &kernelY, Point anchor, double delta, int bordertype)
@ -1277,16 +1450,16 @@ void cv::ocl::sepFilter2D(const oclMat &src, oclMat &dst, int ddepth, const Mat
    dst.create(src.size(), CV_MAKETYPE(ddepth, src.channels()));
-    Ptr<FilterEngine_GPU> f = createSeparableLinearFilter_GPU(src.type(), dst.type(), kernelX, kernelY, anchor, delta, bordertype);
+    Ptr<FilterEngine_GPU> f = createSeparableLinearFilter_GPU(src.type(), dst.type(), kernelX, kernelY, anchor, delta, bordertype, src.size());
    f->apply(src, dst);
 }
-Ptr<FilterEngine_GPU> cv::ocl::createDerivFilter_GPU(int srcType, int dstType, int dx, int dy, int ksize, int borderType)
+Ptr<FilterEngine_GPU> cv::ocl::createDerivFilter_GPU(int srcType, int dstType, int dx, int dy, int ksize, int borderType, Size imgSize )
 {
    Mat kx, ky;
    getDerivKernels(kx, ky, dx, dy, ksize, false, CV_32F);
    return createSeparableLinearFilter_GPU(srcType, dstType,
-                                           kx, ky, Point(-1, -1), 0, borderType);
+                                           kx, ky, Point(-1, -1), 0, borderType, imgSize);
 }
 ////////////////////////////////////////////////////////////////////////////////////////////////////
@ -1356,7 +1529,7 @@ void cv::ocl::Laplacian(const oclMat &src, oclMat &dst, int ddepth, int ksize, d
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // Gaussian Filter
-Ptr<FilterEngine_GPU> cv::ocl::createGaussianFilter_GPU(int type, Size ksize, double sigma1, double sigma2, int bordertype)
+Ptr<FilterEngine_GPU> cv::ocl::createGaussianFilter_GPU(int type, Size ksize, double sigma1, double sigma2, int bordertype, Size imgSize)
 {
    int depth = CV_MAT_DEPTH(type);
@ -1383,7 +1556,7 @@ Ptr<FilterEngine_GPU> cv::ocl::createGaussianFilter_GPU(int type, Size ksize, do
    else
        ky = getGaussianKernel(ksize.height, sigma2, std::max(depth, CV_32F));
-    return createSeparableLinearFilter_GPU(type, type, kx, ky, Point(-1, -1), 0.0, bordertype);
+    return createSeparableLinearFilter_GPU(type, type, kx, ky, Point(-1, -1), 0.0, bordertype, imgSize);
 }
 void cv::ocl::GaussianBlur(const oclMat &src, oclMat &dst, Size ksize, double sigma1, double sigma2, int bordertype)
@ -1419,7 +1592,7 @@ void cv::ocl::GaussianBlur(const oclMat &src, oclMat &dst, Size ksize, double si
    dst.create(src.size(), src.type());
-    Ptr<FilterEngine_GPU> f = createGaussianFilter_GPU(src.type(), ksize, sigma1, sigma2, bordertype);
+    Ptr<FilterEngine_GPU> f = createGaussianFilter_GPU(src.type(), ksize, sigma1, sigma2, bordertype, src.size());
    f->apply(src, dst);
 }
--- a/modules/ocl/src/gftt.cpp
+++ b/modules/ocl/src/gftt.cpp
@ -48,154 +48,142 @@
 using namespace cv;
 using namespace cv::ocl;
 // currently sort procedure on the host is more efficient
 static bool use_cpu_sorter = true;
-namespace
+// compact structure for corners
 struct DefCorner
 {
-enum SortMethod
+    float eig;  //eigenvalue of corner
    short x;    //x coordinate of corner point
    short y;    //y coordinate of corner point
 } ;
 // compare procedure for corner
 //it is used for sort on the host side
 struct DefCornerCompare
 {
-    CPU_STL,
+    bool operator()(const DefCorner a, const DefCorner b) const
    BITONIC,
    SELECTION
 };
 const int GROUP_SIZE = 256;
 template<SortMethod method>
 struct Sorter
 {
    //typedef EigType;
 };
 //TODO(pengx): optimize GPU sorter's performance thus CPU sorter is removed.
 template<>
 struct Sorter<CPU_STL>
 {
    typedef oclMat EigType;
    static cv::Mutex cs;
    static Mat mat_eig;
    //prototype
    static int clfloat2Gt(cl_float2 pt1, cl_float2 pt2)
    {
-        float v1 = mat_eig.at<float>(cvRound(pt1.s[1]), cvRound(pt1.s[0]));
+        return a.eig > b.eig;
        float v2 = mat_eig.at<float>(cvRound(pt2.s[1]), cvRound(pt2.s[0]));
        return v1 > v2;
    }
    static void sortCorners_caller(const EigType& eig_tex, oclMat& corners, const int count)
    {
        cv::AutoLock lock(cs);
        //temporarily use STL's sort function
        Mat mat_corners = corners;
        mat_eig = eig_tex;
        std::sort(mat_corners.begin<cl_float2>(), mat_corners.begin<cl_float2>() + count, clfloat2Gt);
        corners = mat_corners;
    }
 };
 cv::Mutex Sorter<CPU_STL>::cs;
 cv::Mat   Sorter<CPU_STL>::mat_eig;
-template<>
+// sort corner point using opencl bitonicosrt implementation
-struct Sorter<BITONIC>
+static void sortCorners_caller(oclMat& corners, const int count)
 {
-    typedef TextureCL EigType;
+    Context * cxt = Context::getContext();
    int     GS = count/2;
    int     LS = min(255,GS);
    size_t  globalThreads[3] = {GS, 1, 1};
    size_t  localThreads[3]  = {LS, 1, 1};
-    static void sortCorners_caller(const EigType& eig_tex, oclMat& corners, const int count)
+    // 2^numStages should be equal to count or the output is invalid
    int numStages = 0;
    for(int i = count; i > 1; i >>= 1)
    {
-        Context * cxt = Context::getContext();
+        ++numStages;
-        size_t globalThreads[3] = {count / 2, 1, 1};
+    }
-        size_t localThreads[3]  = {GROUP_SIZE, 1, 1};
+    const int argc = 4;
-
+    std::vector< std::pair<size_t, const void *> > args(argc);
-        // 2^numStages should be equal to count or the output is invalid
+    std::string kernelname = "sortCorners_bitonicSort";
-        int numStages = 0;
+    args[0] = std::make_pair(sizeof(cl_mem), (void *)&corners.data);
-        for(int i = count; i > 1; i >>= 1)
+    args[1] = std::make_pair(sizeof(cl_int), (void *)&count);
    for(int stage = 0; stage < numStages; ++stage)
    {
        args[2] = std::make_pair(sizeof(cl_int), (void *)&stage);
        for(int passOfStage = 0; passOfStage < stage + 1; ++passOfStage)
        {
-            ++numStages;
+            args[3] = std::make_pair(sizeof(cl_int), (void *)&passOfStage);
-        }
+            openCLExecuteKernel(cxt, &imgproc_gftt, kernelname, globalThreads, localThreads, args, -1, -1);
        const int argc = 5;
        std::vector< std::pair<size_t, const void *> > args(argc);
        String kernelname = "sortCorners_bitonicSort";
        args[0] = std::make_pair(sizeof(cl_mem), (void *)&eig_tex);
        args[1] = std::make_pair(sizeof(cl_mem), (void *)&corners.data);
        args[2] = std::make_pair(sizeof(cl_int), (void *)&count);
        for(int stage = 0; stage < numStages; ++stage)
        {
            args[3] = std::make_pair(sizeof(cl_int), (void *)&stage);
            for(int passOfStage = 0; passOfStage < stage + 1; ++passOfStage)
            {
                args[4] = std::make_pair(sizeof(cl_int), (void *)&passOfStage);
                openCLExecuteKernel(cxt, &imgproc_gftt, kernelname, globalThreads, localThreads, args, -1, -1);
            }
        }
    }
-};
+}
-template<>
+// find corners on matrix and put it into array
-struct Sorter<SELECTION>
+static void findCorners_caller(
-{
+    const oclMat&   eig_mat,        //input matrix worth eigenvalues
-    typedef TextureCL EigType;
+    oclMat&         eigMinMax,      //input with min and max values of eigenvalues
-
+    const float     qualityLevel,
-    static void sortCorners_caller(const EigType& eig_tex, oclMat& corners, const int count)
+    const oclMat&   mask,
-    {
+    oclMat&         corners,        //output array with detected corners
-        Context * cxt = Context::getContext();
+    oclMat&         counter)        //output value with number of detected corners, have to be 0 before call
        size_t globalThreads[3] = {count, 1, 1};
        size_t localThreads[3]  = {GROUP_SIZE, 1, 1};
        std::vector< std::pair<size_t, const void *> > args;
        //local
        String kernelname = "sortCorners_selectionSortLocal";
        int lds_size = GROUP_SIZE * sizeof(cl_float2);
        args.push_back( std::make_pair( sizeof(cl_mem), (void*)&eig_tex) );
        args.push_back( std::make_pair( sizeof(cl_mem), (void*)&corners.data) );
        args.push_back( std::make_pair( sizeof(cl_int), (void*)&count) );
        args.push_back( std::make_pair( lds_size,       (void*)NULL) );
        openCLExecuteKernel(cxt, &imgproc_gftt, kernelname, globalThreads, localThreads, args, -1, -1);
        //final
        kernelname = "sortCorners_selectionSortFinal";
        args.pop_back();
        openCLExecuteKernel(cxt, &imgproc_gftt, kernelname, globalThreads, localThreads, args, -1, -1);
    }
 };
 int findCorners_caller(
    const TextureCL& eig,
    const float threshold,
    const oclMat& mask,
    oclMat& corners,
    const int max_count)
 {
    String  opt;
    std::vector<int> k;
    Context * cxt = Context::getContext();
    std::vector< std::pair<size_t, const void*> > args;
    String kernelname = "findCorners";
    const int mask_strip = mask.step / mask.elemSize1();
-    oclMat g_counter(1, 1, CV_32SC1);
+    args.push_back(std::make_pair( sizeof(cl_mem),   (void*)&(eig_mat.data)));
    g_counter.setTo(0);
-    args.push_back(std::make_pair( sizeof(cl_mem),   (void*)&eig  ));
+    int src_pitch = (int)eig_mat.step;
    args.push_back(std::make_pair( sizeof(cl_int),   (void*)&src_pitch ));
    args.push_back(std::make_pair( sizeof(cl_mem),   (void*)&mask.data ));
    args.push_back(std::make_pair( sizeof(cl_mem),   (void*)&corners.data ));
    args.push_back(std::make_pair( sizeof(cl_int),   (void*)&mask_strip));
-    args.push_back(std::make_pair( sizeof(cl_float), (void*)&threshold ));
+    args.push_back(std::make_pair( sizeof(cl_mem),   (void*)&eigMinMax.data ));
-    args.push_back(std::make_pair( sizeof(cl_int), (void*)&eig.rows ));
+    args.push_back(std::make_pair( sizeof(cl_float), (void*)&qualityLevel ));
-    args.push_back(std::make_pair( sizeof(cl_int), (void*)&eig.cols ));
+    args.push_back(std::make_pair( sizeof(cl_int),   (void*)&eig_mat.rows ));
-    args.push_back(std::make_pair( sizeof(cl_int), (void*)&max_count ));
+    args.push_back(std::make_pair( sizeof(cl_int),   (void*)&eig_mat.cols ));
-    args.push_back(std::make_pair( sizeof(cl_mem), (void*)&g_counter.data ));
+    args.push_back(std::make_pair( sizeof(cl_int),   (void*)&corners.cols ));
    args.push_back(std::make_pair( sizeof(cl_mem),   (void*)&counter.data ));
-    size_t globalThreads[3] = {eig.cols, eig.rows, 1};
+    size_t globalThreads[3] = {eig_mat.cols, eig_mat.rows, 1};
    size_t localThreads[3]  = {16, 16, 1};
    if(!mask.empty())
        opt += " -D WITH_MASK=1";
-    const char * opt = mask.empty() ? "" : "-D WITH_MASK";
+     openCLExecuteKernel(cxt, &imgproc_gftt, "findCorners", globalThreads, localThreads, args, -1, -1, opt.c_str());
-    openCLExecuteKernel(cxt, &imgproc_gftt, kernelname, globalThreads, localThreads, args, -1, -1, opt);
+}
-    return std::min(Mat(g_counter).at<int>(0), max_count);
+
 static void minMaxEig_caller(const oclMat &src, oclMat &dst, oclMat & tozero)
 {
    size_t groupnum = src.clCxt->getDeviceInfo().maxComputeUnits;
    CV_Assert(groupnum != 0);
    int dbsize = groupnum * 2 * src.elemSize();
    ensureSizeIsEnough(1, dbsize, CV_8UC1, dst);
    cl_mem dst_data = reinterpret_cast<cl_mem>(dst.data);
    int all_cols = src.step / src.elemSize();
    int pre_cols = (src.offset % src.step) / src.elemSize();
    int sec_cols = all_cols - (src.offset % src.step + src.cols * src.elemSize() - 1) / src.elemSize() - 1;
    int invalid_cols = pre_cols + sec_cols;
    int cols = all_cols - invalid_cols , elemnum = cols * src.rows;
    int offset = src.offset / src.elemSize();
    {// first parallel pass
        std::vector<std::pair<size_t , const void *> > args;
        args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data));
        args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst_data ));
        args.push_back( std::make_pair( sizeof(cl_int) , (void *)&cols ));
        args.push_back( std::make_pair( sizeof(cl_int) , (void *)&invalid_cols ));
        args.push_back( std::make_pair( sizeof(cl_int) , (void *)&offset));
        args.push_back( std::make_pair( sizeof(cl_int) , (void *)&elemnum));
        args.push_back( std::make_pair( sizeof(cl_int) , (void *)&groupnum));
        size_t globalThreads[3] = {groupnum * 256, 1, 1};
        size_t localThreads[3] = {256, 1, 1};
        openCLExecuteKernel(src.clCxt, &arithm_minMax, "arithm_op_minMax", globalThreads, localThreads,
                            args, -1, -1, "-D T=float -D DEPTH_5");
    }
    {// run final "serial" kernel to find accumulate results from threads and reset corner counter
        std::vector<std::pair<size_t , const void *> > args;
        args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst_data ));
        args.push_back( std::make_pair( sizeof(cl_int) , (void *)&groupnum ));
        args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&tozero.data ));
        size_t globalThreads[3] = {1, 1, 1};
        size_t localThreads[3] = {1, 1, 1};
        openCLExecuteKernel(src.clCxt, &imgproc_gftt, "arithm_op_minMax_final", globalThreads, localThreads,
                            args, -1, -1);
    }
 }
 }//unnamed namespace
 void cv::ocl::GoodFeaturesToTrackDetector_OCL::operator ()(const oclMat& image, oclMat& corners, const oclMat& mask)
 {
@ -205,67 +193,99 @@ void cv::ocl::GoodFeaturesToTrackDetector_OCL::operator ()(const oclMat& image,
    ensureSizeIsEnough(image.size(), CV_32F, eig_);
    if (useHarrisDetector)
-        cornerMinEigenVal_dxdy(image, eig_, Dx_, Dy_, blockSize, 3, harrisK);
+        cornerHarris_dxdy(image, eig_, Dx_, Dy_, blockSize, 3, harrisK);
    else
        cornerMinEigenVal_dxdy(image, eig_, Dx_, Dy_, blockSize, 3);
-    double maxVal = 0;
+    ensureSizeIsEnough(1,1, CV_32SC1, counter_);
    minMax(eig_, NULL, &maxVal);
-    ensureSizeIsEnough(1, std::max(1000, static_cast<int>(image.size().area() * 0.05)), CV_32FC2, tmpCorners_);
+    // find max eigenvalue and reset detected counters
    minMaxEig_caller(eig_,eig_minmax_,counter_);
-    Ptr<TextureCL> eig_tex = bindTexturePtr(eig_);
+    // allocate buffer for kernels
-    int total = findCorners_caller(
+    int corner_array_size = std::max(1024, static_cast<int>(image.size().area() * 0.05));
-        *eig_tex,
+
-        static_cast<float>(maxVal * qualityLevel),
+    if(!use_cpu_sorter)
    {   // round to 2^n
        unsigned int n=1;
        for(n=1;n<(unsigned int)corner_array_size;n<<=1);
        corner_array_size = (int)n;
        ensureSizeIsEnough(1, corner_array_size , CV_32FC2, tmpCorners_);
        // set to 0 to be able use bitonic sort on whole 2^n array
        tmpCorners_.setTo(0);
    }
    else
    {
        ensureSizeIsEnough(1, corner_array_size , CV_32FC2, tmpCorners_);
    }
    int total = tmpCorners_.cols; // by default the number of corner is full array
    std::vector<DefCorner>   tmp(tmpCorners_.cols); // input buffer with corner for HOST part of algorithm
    //find points with high eigenvalue and put it into the output array
    findCorners_caller(
        eig_,
        eig_minmax_,
        static_cast<float>(qualityLevel),
        mask,
        tmpCorners_,
-        tmpCorners_.cols);
+        counter_);
    if(!use_cpu_sorter)
    {// sort detected corners on deivce side
        sortCorners_caller(tmpCorners_, corner_array_size);
    }
    else
    {// send non-blocking request to read real non-zero number of corners to sort it on the HOST side
        openCLVerifyCall(clEnqueueReadBuffer(getClCommandQueue(counter_.clCxt), (cl_mem)counter_.data, CL_FALSE, 0,sizeof(int), &total, 0, NULL, NULL));
    }
    //blocking read whole corners array (sorted or not sorted)
    openCLReadBuffer(tmpCorners_.clCxt,(cl_mem)tmpCorners_.data,&tmp[0],tmpCorners_.cols*sizeof(DefCorner));
    if (total == 0)
-    {
+    {// check for trivial case
        corners.release();
        return;
    }
    if(use_cpu_sorter)
-    {
+    {// sort detected corners on cpu side.
-        Sorter<CPU_STL>::sortCorners_caller(eig_, tmpCorners_, total);
+        tmp.resize(total);
-    }
+        std::sort(tmp.begin(), tmp.end(), DefCornerCompare());
    else
    {
        //if total is power of 2
        if(((total - 1) & (total)) == 0)
        {
            Sorter<BITONIC>::sortCorners_caller(*eig_tex, tmpCorners_, total);
        }
        else
        {
            Sorter<SELECTION>::sortCorners_caller(*eig_tex, tmpCorners_, total);
        }
    }
    //estimate maximal size of final output array
    int total_max = maxCorners > 0 ? std::min(maxCorners, total) : total;
    int D2 = (int)ceil(minDistance * minDistance);
    // allocate output buffer
    std::vector<Point2f> tmp2;
    tmp2.reserve(total_max);
    if (minDistance < 1)
-    {
+    {// we have not distance restriction. then just copy with conversion maximal allowed points into output array
-        Rect roi_range(0, 0, maxCorners > 0 ? std::min(maxCorners, total) : total, 1);
+        for(int i=0;i<total_max && tmp[i].eig>0.0f;++i)
-        tmpCorners_(roi_range).copyTo(corners);
+        {
            tmp2.push_back(Point2f(tmp[i].x,tmp[i].y));
        }
    }
    else
-    {
+    {// we have distance restriction. then start coping to output array from the first element and check distance for each next one
        std::vector<Point2f> tmp(total);
        downloadPoints(tmpCorners_, tmp);
        std::vector<Point2f> tmp2;
        tmp2.reserve(total);
        const int cell_size = cvRound(minDistance);
        const int grid_width = (image.cols + cell_size - 1) / cell_size;
        const int grid_height = (image.rows + cell_size - 1) / cell_size;
-        std::vector< std::vector<Point2f> > grid(grid_width * grid_height);
+        std::vector< std::vector<Point2i> > grid(grid_width * grid_height);
-        for (int i = 0; i < total; ++i)
+        for (int i = 0; i < total ; ++i)
        {
-            Point2f p = tmp[i];
+            DefCorner p = tmp[i];
            if(p.eig<=0.0f)
                break; // condition to stop that is needed for GPU bitonic sort usage.
            bool good = true;
@ -287,40 +307,42 @@ void cv::ocl::GoodFeaturesToTrackDetector_OCL::operator ()(const oclMat& image,
            {
                for (int xx = x1; xx <= x2; xx++)
                {
-                    std::vector<Point2f>& m = grid[yy * grid_width + xx];
+                    std::vector<Point2i>& m = grid[yy * grid_width + xx];
-
+                    if (m.empty())
-                    if (!m.empty())
+                        continue;
                    for(size_t j = 0; j < m.size(); j++)
                    {
-                        for(size_t j = 0; j < m.size(); j++)
+                        int dx = p.x - m[j].x;
-                        {
+                        int dy = p.y - m[j].y;
                            float dx = p.x - m[j].x;
                            float dy = p.y - m[j].y;
-                            if (dx * dx + dy * dy < minDistance * minDistance)
+                        if (dx * dx + dy * dy < D2)
-                            {
+                        {
-                                good = false;
+                            good = false;
-                                goto break_out;
+                            goto break_out_;
                            }
                        }
                    }
                }
            }
-            break_out:
+            break_out_:
            if(good)
            {
-                grid[y_cell * grid_width + x_cell].push_back(p);
+                grid[y_cell * grid_width + x_cell].push_back(Point2i(p.x,p.y));
-                tmp2.push_back(p);
+                tmp2.push_back(Point2f(p.x,p.y));
                if (maxCorners > 0 && tmp2.size() == static_cast<size_t>(maxCorners))
                    break;
            }
        }
        corners.upload(Mat(1, static_cast<int>(tmp2.size()), CV_32FC2, &tmp2[0]));
    }
    int final_size = static_cast<int>(tmp2.size());
    if(final_size>0)
        corners.upload(Mat(1, final_size, CV_32FC2, &tmp2[0]));
    else
        corners.release();
 }
 void cv::ocl::GoodFeaturesToTrackDetector_OCL::downloadPoints(const oclMat &points, std::vector<Point2f> &points_v)
 {
--- a/modules/ocl/src/haar.cpp
+++ b/modules/ocl/src/haar.cpp
@ -866,16 +866,17 @@ void OclCascadeClassifier::detectMultiScale(oclMat &gimg, CV_OUT std::vector<cv:
        if(gcascade->is_stump_based && gsum.clCxt->supportsFeature(FEATURE_CL_INTEL_DEVICE))
        {
-            //setup local group size
+            //setup local group size for "pixel step" = 1
-            localThreads[0] = 8;
+            localThreads[0] = 16;
-            localThreads[1] = 16;
+            localThreads[1] = 32;
            localThreads[2] = 1;
-            //init maximal number of workgroups
+            //calc maximal number of workgroups
            int WGNumX = 1+(sizev[0].width /(localThreads[0]));
            int WGNumY = 1+(sizev[0].height/(localThreads[1]));
            int WGNumZ = loopcount;
-            int WGNum = 0; //accurate number of non -empty workgroups
+            int WGNumTotal = 0; //accurate number of non-empty workgroups
            int WGNumSampled = 0; //accurate number of workgroups processed only 1/4 part of all pixels. it is made for large images with scale <= 2
            oclMat      oclWGInfo(1,sizeof(cl_int4) * WGNumX*WGNumY*WGNumZ,CV_8U);
            {
                cl_int4*    pWGInfo = (cl_int4*)clEnqueueMapBuffer(getClCommandQueue(oclWGInfo.clCxt),(cl_mem)oclWGInfo.datastart,true,CL_MAP_WRITE, 0, oclWGInfo.step, 0,0,0,&status);
@ -895,12 +896,16 @@ void OclCascadeClassifier::detectMultiScale(oclMat &gimg, CV_OUT std::vector<cv:
                            if(gx>=(Width-cascade->orig_window_size.width))
                                continue; // no data to process
                            if(scaleinfo[z].factor<=2)
                            {
                                WGNumSampled++;
                            }
                            // save no-empty workgroup info into array
-                            pWGInfo[WGNum].s[0] = scaleinfo[z].width_height;
+                            pWGInfo[WGNumTotal].s[0] = scaleinfo[z].width_height;
-                            pWGInfo[WGNum].s[1] = (gx << 16) | gy;
+                            pWGInfo[WGNumTotal].s[1] = (gx << 16) | gy;
-                            pWGInfo[WGNum].s[2] = scaleinfo[z].imgoff;
+                            pWGInfo[WGNumTotal].s[2] = scaleinfo[z].imgoff;
-                            memcpy(&(pWGInfo[WGNum].s[3]),&(scaleinfo[z].factor),sizeof(float));
+                            memcpy(&(pWGInfo[WGNumTotal].s[3]),&(scaleinfo[z].factor),sizeof(float));
-                            WGNum++;
+                            WGNumTotal++;
                        }
                    }
                }
@ -908,13 +913,8 @@ void OclCascadeClassifier::detectMultiScale(oclMat &gimg, CV_OUT std::vector<cv:
                pWGInfo = NULL;
            }
            // setup global sizes to have linear array of workgroups with WGNum size
            globalThreads[0] = localThreads[0]*WGNum;
            globalThreads[1] = localThreads[1];
            globalThreads[2] = 1;
 #define NODE_SIZE 12
-            // pack node info to have less memory loads
+            // pack node info to have less memory loads on the device side
            oclMat  oclNodesPK(1,sizeof(cl_int) * NODE_SIZE * nodenum,CV_8U);
            {
                cl_int  status;
@ -963,8 +963,6 @@ void OclCascadeClassifier::detectMultiScale(oclMat &gimg, CV_OUT std::vector<cv:
            options += format(" -D WND_SIZE_X=%d",cascade->orig_window_size.width);
            options += format(" -D WND_SIZE_Y=%d",cascade->orig_window_size.height);
            options += format(" -D STUMP_BASED=%d",gcascade->is_stump_based);
            options += format(" -D LSx=%d",localThreads[0]);
            options += format(" -D LSy=%d",localThreads[1]);
            options += format(" -D SPLITNODE=%d",splitnode);
            options += format(" -D SPLITSTAGE=%d",splitstage);
            options += format(" -D OUTPUTSZ=%d",outputsz);
@ -972,8 +970,39 @@ void OclCascadeClassifier::detectMultiScale(oclMat &gimg, CV_OUT std::vector<cv:
            // init candiate global count by 0
            int pattern = 0;
            openCLSafeCall(clEnqueueWriteBuffer(qu, candidatebuffer, 1, 0, 1 * sizeof(pattern),&pattern, 0, NULL, NULL));
-            // execute face detector
+
-            openCLExecuteKernel(gsum.clCxt, &haarobjectdetect, "gpuRunHaarClassifierCascadePacked", globalThreads, localThreads, args, -1, -1, options.c_str());
+            if(WGNumTotal>WGNumSampled)
            {// small images and each pixel is processed
                // setup global sizes to have linear array of workgroups with WGNum size
                int     pixelstep = 1;
                size_t  LS[3]={localThreads[0]/pixelstep,localThreads[1]/pixelstep,1};
                globalThreads[0] = LS[0]*(WGNumTotal-WGNumSampled);
                globalThreads[1] = LS[1];
                globalThreads[2] = 1;
                String options1 = options;
                options1 += format(" -D PIXEL_STEP=%d",pixelstep);
                options1 += format(" -D WGSTART=%d",WGNumSampled);
                options1 += format(" -D LSx=%d",LS[0]);
                options1 += format(" -D LSy=%d",LS[1]);
                // execute face detector
                openCLExecuteKernel(gsum.clCxt, &haarobjectdetect, "gpuRunHaarClassifierCascadePacked", globalThreads, LS, args, -1, -1, options1.c_str());
            }
            if(WGNumSampled>0)
            {// large images each 4th pixel is processed
                // setup global sizes to have linear array of workgroups with WGNum size
                int     pixelstep = 2;
                size_t  LS[3]={localThreads[0]/pixelstep,localThreads[1]/pixelstep,1};
                globalThreads[0] = LS[0]*WGNumSampled;
                globalThreads[1] = LS[1];
                globalThreads[2] = 1;
                String options2 = options;
                options2 += format(" -D PIXEL_STEP=%d",pixelstep);
                options2 += format(" -D WGSTART=%d",0);
                options2 += format(" -D LSx=%d",LS[0]);
                options2 += format(" -D LSy=%d",LS[1]);
                // execute face detector
                openCLExecuteKernel(gsum.clCxt, &haarobjectdetect, "gpuRunHaarClassifierCascadePacked", globalThreads, LS, args, -1, -1, options2.c_str());
            }
            //read candidate buffer back and put it into host list
            openCLReadBuffer( gsum.clCxt, candidatebuffer, candidate, 4 * sizeof(int)*outputsz );
            assert(candidate[0]<outputsz);
--- a/modules/ocl/src/hog.cpp
+++ b/modules/ocl/src/hog.cpp
@ -76,6 +76,11 @@ namespace cv
                int cdescr_width;
                int cdescr_height;
                // A shift value and type that allows qangle to be different
                // sizes on different hardware
                int qangle_step_shift;
                int qangle_type;
                void set_up_constants(int nbins, int block_stride_x, int block_stride_y,
                                      int nblocks_win_x, int nblocks_win_y);
@ -153,6 +158,7 @@ cv::ocl::HOGDescriptor::HOGDescriptor(Size win_size_, Size block_size_, Size blo
        hog_device_cpu = true;
    else
        hog_device_cpu = false;
 }
 size_t cv::ocl::HOGDescriptor::getDescriptorSize() const
@ -213,7 +219,7 @@ void cv::ocl::HOGDescriptor::init_buffer(const oclMat &img, Size win_stride)
        effect_size = img.size();
    grad.create(img.size(), CV_32FC2);
-    qangle.create(img.size(), CV_8UC2);
+    qangle.create(img.size(), hog::qangle_type);
    const size_t block_hist_size = getBlockHistogramSize();
    const Size blocks_per_img = numPartsWithin(img.size(), block_size, block_stride);
@ -1606,6 +1612,16 @@ void cv::ocl::device::hog::set_up_constants(int nbins,
    int descr_size = descr_width * nblocks_win_y;
    cdescr_size = descr_size;
    qangle_type = CV_8UC2;
    qangle_step_shift = 0;
    // Some Intel devices have low single-byte access performance,
    // so we change the datatype here.
    if (Context::getContext()->supportsFeature(FEATURE_CL_INTEL_DEVICE))
    {
        qangle_type = CV_32SC2;
        qangle_step_shift = 2;
    }
 }
 void cv::ocl::device::hog::compute_hists(int nbins,
@ -1627,7 +1643,7 @@ void cv::ocl::device::hog::compute_hists(int nbins,
    int blocks_total = img_block_width * img_block_height;
    int grad_quadstep = grad.step >> 2;
-    int qangle_step = qangle.step;
+    int qangle_step = qangle.step >> qangle_step_shift;
    int blocks_in_group = 4;
    size_t localThreads[3] = { blocks_in_group * 24, 2, 1 };
@ -1892,7 +1908,7 @@ void cv::ocl::device::hog::compute_gradients_8UC1(int height, int width,
    char correctGamma = (correct_gamma) ? 1 : 0;
    int img_step = img.step;
    int grad_quadstep = grad.step >> 3;
-    int qangle_step = qangle.step >> 1;
+    int qangle_step = qangle.step >> (1 + qangle_step_shift);
    args.push_back( std::make_pair( sizeof(cl_int), (void *)&height));
    args.push_back( std::make_pair( sizeof(cl_int), (void *)&width));
@ -1927,7 +1943,7 @@ void cv::ocl::device::hog::compute_gradients_8UC4(int height, int width,
    char correctGamma = (correct_gamma) ? 1 : 0;
    int img_step = img.step >> 2;
    int grad_quadstep = grad.step >> 3;
-    int qangle_step = qangle.step >> 1;
+    int qangle_step = qangle.step >> (1 + qangle_step_shift);
    args.push_back( std::make_pair( sizeof(cl_int), (void *)&height));
    args.push_back( std::make_pair( sizeof(cl_int), (void *)&width));
--- a/modules/ocl/src/imgproc.cpp
+++ b/modules/ocl/src/imgproc.cpp
@ -1035,67 +1035,117 @@ namespace cv
            else
                scale = 1. / scale;
-            if (ksize > 0)
+            const int sobel_lsz = 16;
            if((src.type() == CV_8UC1 || src.type() == CV_32FC1) &&
                (ksize==3 || ksize==5 || ksize==7 || ksize==-1) &&
                src.wholerows > sobel_lsz + (ksize>>1) &&
                src.wholecols > sobel_lsz + (ksize>>1))
            {
-                Context* clCxt = Context::getContext();
+                Dx.create(src.size(), CV_32FC1);
-                if(clCxt->supportsFeature(FEATURE_CL_INTEL_DEVICE) && src.type() == CV_8UC1 &&
+                Dy.create(src.size(), CV_32FC1);
-                    src.cols % 8 == 0 && src.rows % 8 == 0 &&
+
-                    ksize==3 &&
+                CV_Assert(Dx.rows == Dy.rows && Dx.cols == Dy.cols);
-                    (borderType ==cv::BORDER_REFLECT ||
+
-                     borderType == cv::BORDER_REPLICATE ||
+                size_t lt2[3] = {sobel_lsz, sobel_lsz, 1};
-                     borderType ==cv::BORDER_REFLECT101 ||
+                size_t gt2[3] = {lt2[0]*(1 + (src.cols-1) / lt2[0]), lt2[1]*(1 + (src.rows-1) / lt2[1]), 1};
-                     borderType ==cv::BORDER_WRAP))
+
                unsigned int src_pitch = src.step;
                unsigned int Dx_pitch = Dx.step;
                unsigned int Dy_pitch = Dy.step;
                int src_offset_x = (src.offset % src.step) / src.elemSize();
                int src_offset_y = src.offset / src.step;
                float _scale = scale;
                std::vector<std::pair<size_t , const void *> > args;
                args.push_back( std::make_pair( sizeof(cl_mem)  , (void *)&src.data ));
                args.push_back( std::make_pair( sizeof(cl_uint) , (void *)&src_pitch ));
                args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_offset_x ));
                args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_offset_y ));
                args.push_back( std::make_pair( sizeof(cl_mem)  , (void *)&Dx.data ));
                args.push_back( std::make_pair( sizeof(cl_int)  , (void *)&Dx.offset ));
                args.push_back( std::make_pair( sizeof(cl_uint) , (void *)&Dx_pitch ));
                args.push_back( std::make_pair( sizeof(cl_mem)  , (void *)&Dy.data ));
                args.push_back( std::make_pair( sizeof(cl_int)  , (void *)&Dy.offset ));
                args.push_back( std::make_pair( sizeof(cl_uint) , (void *)&Dy_pitch ));
                args.push_back( std::make_pair( sizeof(cl_int)  , (void *)&src.wholecols ));
                args.push_back( std::make_pair( sizeof(cl_int)  , (void *)&src.wholerows ));
                args.push_back( std::make_pair( sizeof(cl_int)  , (void *)&Dx.cols ));
                args.push_back( std::make_pair( sizeof(cl_int)  , (void *)&Dx.rows ));
                args.push_back( std::make_pair( sizeof(cl_float), (void *)&_scale ));
                String option = cv::format("-D BLK_X=%d -D BLK_Y=%d",(int)lt2[0],(int)lt2[1]);
                switch(src.type())
                {
-                    Dx.create(src.size(), CV_32FC1);
+                case CV_8UC1:
-                    Dy.create(src.size(), CV_32FC1);
+                    option += " -D SRCTYPE=uchar";
-
+                    break;
-                    const unsigned int block_x = 8;
+                case CV_32FC1:
-                    const unsigned int block_y = 8;
+                    option += " -D SRCTYPE=float";
-
+                    break;
                    unsigned int src_pitch = src.step;
                    unsigned int dst_pitch = Dx.cols;
                    float _scale = scale;
                    std::vector<std::pair<size_t , const void *> > args;
                    args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data ));
                    args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&Dx.data ));
                    args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&Dy.data ));
                    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.cols ));
                    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.rows ));
                    args.push_back( std::make_pair( sizeof(cl_uint) , (void *)&src_pitch ));
                    args.push_back( std::make_pair( sizeof(cl_uint) , (void *)&dst_pitch ));
                    args.push_back( std::make_pair( sizeof(cl_float) , (void *)&_scale ));
                    size_t gt2[3] = {src.cols, src.rows, 1}, lt2[3] = {block_x, block_y, 1};
                    String option = "-D BLK_X=8 -D BLK_Y=8";
                    switch(borderType)
                    {
                    case cv::BORDER_REPLICATE:
                        option += " -D BORDER_REPLICATE";
                        break;
                    case cv::BORDER_REFLECT:
                        option += " -D BORDER_REFLECT";
                        break;
                    case cv::BORDER_REFLECT101:
                        option += " -D BORDER_REFLECT101";
                        break;
                    case cv::BORDER_WRAP:
                        option += " -D BORDER_WRAP";
                        break;
                    }
                    openCLExecuteKernel(src.clCxt, &imgproc_sobel3, "sobel3", gt2, lt2, args, -1, -1, option.c_str() );
                }
-                else
+                switch(borderType)
                {
                case cv::BORDER_CONSTANT:
                    option += " -D BORDER_CONSTANT";
                    break;
                case cv::BORDER_REPLICATE:
                    option += " -D BORDER_REPLICATE";
                    break;
                case cv::BORDER_REFLECT:
                    option += " -D BORDER_REFLECT";
                    break;
                case cv::BORDER_REFLECT101:
                    option += " -D BORDER_REFLECT_101";
                    break;
                case cv::BORDER_WRAP:
                    option += " -D BORDER_WRAP";
                    break;
                default:
                    CV_Error(CV_StsBadFlag, "BORDER type is not supported!");
                    break;
                }
                String kernel_name;
                switch(ksize)
                {
                case -1:
                    option += " -D SCHARR";
                    kernel_name = "sobel3";
                    break;
                case 3:
                    kernel_name = "sobel3";
                    break;
                case 5:
                    kernel_name = "sobel5";
                    break;
                case 7:
                    kernel_name = "sobel7";
                    break;
                default:
                    CV_Error(CV_StsBadFlag, "Kernel size is not supported!");
                    break;
                }
                openCLExecuteKernel(src.clCxt, &imgproc_sobel3, kernel_name, gt2, lt2, args, -1, -1, option.c_str() );
            }
            else
            {
                if (ksize > 0)
                {
                    Sobel(src, Dx, CV_32F, 1, 0, ksize, scale, 0, borderType);
                    Sobel(src, Dy, CV_32F, 0, 1, ksize, scale, 0, borderType);
                }
-            }
+                else
-            else
+                {
-            {
+                    Scharr(src, Dx, CV_32F, 1, 0, scale, 0, borderType);
-                Scharr(src, Dx, CV_32F, 1, 0, scale, 0, borderType);
+                    Scharr(src, Dy, CV_32F, 0, 1, scale, 0, borderType);
-                Scharr(src, Dy, CV_32F, 0, 1, scale, 0, borderType);
+                }
            }
            CV_Assert(Dx.offset == 0 && Dy.offset == 0);
        }
--- a/modules/ocl/src/opencl/cvt_color.cl
+++ b/modules/ocl/src/opencl/cvt_color.cl
--- a/modules/ocl/src/opencl/filtering_sep_filter_singlepass.cl
+++ b/modules/ocl/src/opencl/filtering_sep_filter_singlepass.cl
@ -0,0 +1,185 @@
 /*M///////////////////////////////////////////////////////////////////////////////////////
 //
 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
 //
 //  By downloading, copying, installing or using the software you agree to this license.
 //  If you do not agree to this license, do not download, install,
 //  copy or use the software.
 //
 //
 //                           License Agreement
 //                For Open Source Computer Vision Library
 //
 // Copyright (C) 2013, Intel Corporation, all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
 //
 //   * Redistribution's of source code must retain the above copyright notice,
 //     this list of conditions and the following disclaimer.
 //
 //   * Redistribution's in binary form must reproduce the above copyright notice,
 //     this list of conditions and the following disclaimer in the documentation
 //     and/or other materials provided with the distribution.
 //
 //   * The name of the copyright holders may not be used to endorse or promote products
 //     derived from this software without specific prior written permission.
 //
 // This software is provided by the copyright holders and contributors "as is" and
 // any express or implied warranties, including, but not limited to, the implied
 // warranties of merchantability and fitness for a particular purpose are disclaimed.
 // In no event shall the Intel Corporation or contributors be liable for any direct,
 // indirect, incidental, special, exemplary, or consequential damages
 // (including, but not limited to, procurement of substitute goods or services;
 // loss of use, data, or profits; or business interruption) however caused
 // and on any theory of liability, whether in contract, strict liability,
 // or tort (including negligence or otherwise) arising in any way out of
 // the use of this software, even if advised of the possibility of such damage.
 //
 //M*/
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 /////////////////////////////////Macro for border type////////////////////////////////////////////
 /////////////////////////////////////////////////////////////////////////////////////////////////
 #ifdef BORDER_CONSTANT
 //CCCCCC|abcdefgh|CCCCCCC
 #define EXTRAPOLATE(x, maxV)
 #elif defined BORDER_REPLICATE
 //aaaaaa|abcdefgh|hhhhhhh
 #define EXTRAPOLATE(x, maxV) \
    { \
        (x) = max(min((x), (maxV) - 1), 0); \
    }
 #elif defined BORDER_WRAP
 //cdefgh|abcdefgh|abcdefg
 #define EXTRAPOLATE(x, maxV) \
    { \
        (x) = ( (x) + (maxV) ) % (maxV); \
    }
 #elif defined BORDER_REFLECT
 //fedcba|abcdefgh|hgfedcb
 #define EXTRAPOLATE(x, maxV) \
    { \
        (x) = min(((maxV)-1)*2-(x)+1, max((x),-(x)-1) ); \
    }
 #elif defined BORDER_REFLECT_101
 //gfedcb|abcdefgh|gfedcba
 #define EXTRAPOLATE(x, maxV) \
    { \
        (x) = min(((maxV)-1)*2-(x), max((x),-(x)) ); \
    }
 #else
 #error No extrapolation method
 #endif
 #define SRC(_x,_y) CONVERT_SRCTYPE(((global SRCTYPE*)(Src+(_y)*SrcPitch))[_x])
 #ifdef BORDER_CONSTANT
 //CCCCCC|abcdefgh|CCCCCCC
 #define ELEM(_x,_y,r_edge,t_edge,const_v) (_x)<0 | (_x) >= (r_edge) | (_y)<0 | (_y) >= (t_edge) ? (const_v) : SRC((_x),(_y))
 #else
 #define ELEM(_x,_y,r_edge,t_edge,const_v) SRC((_x),(_y))
 #endif
 #define DST(_x,_y) (((global DSTTYPE*)(Dst+DstOffset+(_y)*DstPitch))[_x])
 //horizontal and vertical filter kernels
 //should be defined on host during compile time to avoid overhead
 __constant uint mat_kernelX[] = {KERNEL_MATRIX_X};
 __constant uint mat_kernelY[] = {KERNEL_MATRIX_Y};
 __kernel __attribute__((reqd_work_group_size(BLK_X,BLK_Y,1))) void sep_filter_singlepass
        (
        __global uchar* Src,
        const uint      SrcPitch,
        const int       srcOffsetX,
        const int       srcOffsetY,
        __global uchar* Dst,
        const int       DstOffset,
        const uint      DstPitch,
        int             width,
        int             height,
        int             dstWidth,
        int             dstHeight
        )
 {
    //RADIUSX, RADIUSY are filter dimensions
    //BLK_X, BLK_Y are local wrogroup sizes
    //all these should be defined on host during compile time
    //first lsmem array for source pixels used in first pass,
    //second lsmemDy for storing first pass results
    __local WORKTYPE lsmem[BLK_Y+2*RADIUSY][BLK_X+2*RADIUSX];
    __local WORKTYPE lsmemDy[BLK_Y][BLK_X+2*RADIUSX];
    //get local and global ids - used as image and local memory array indexes
    int lix = get_local_id(0);
    int liy = get_local_id(1);
    int x = (int)get_global_id(0);
    int y = (int)get_global_id(1);
    //calculate pixel position in source image taking image offset into account
    int srcX = x + srcOffsetX - RADIUSX;
    int srcY = y + srcOffsetY - RADIUSY;
    int xb = srcX;
    int yb = srcY;
    //extrapolate coordinates, if needed
    //and read my own source pixel into local memory
    //with account for extra border pixels, which will be read by starting workitems
    int clocY = liy;
    int cSrcY = srcY;
    do
    {
        int yb = cSrcY;
        EXTRAPOLATE(yb, (height));
        int clocX = lix;
        int cSrcX = srcX;
        do
        {
            int xb = cSrcX;
            EXTRAPOLATE(xb,(width));
            lsmem[clocY][clocX] = ELEM(xb, yb, (width), (height), 0 );
            clocX += BLK_X;
            cSrcX += BLK_X;
        }
        while(clocX < BLK_X+(RADIUSX*2));
        clocY += BLK_Y;
        cSrcY += BLK_Y;
    }
    while(clocY < BLK_Y+(RADIUSY*2));
    barrier(CLK_LOCAL_MEM_FENCE);
    //do vertical filter pass
    //and store intermediate results to second local memory array
    int i;
    WORKTYPE sum = 0.0f;
    int clocX = lix;
    do
    {
        sum = 0.0f;
        for(i=0; i<=2*RADIUSY; i++)
            sum = mad(lsmem[liy+i][clocX], as_float(mat_kernelY[i]), sum);
        lsmemDy[liy][clocX] = sum;
        clocX += BLK_X;
    }
    while(clocX < BLK_X+(RADIUSX*2));
    barrier(CLK_LOCAL_MEM_FENCE);
    //if this pixel happened to be out of image borders because of global size rounding,
    //then just return
    if( x >= dstWidth || y >=dstHeight )  return;
    //do second horizontal filter pass
    //and calculate final result
    sum = 0.0f;
    for(i=0; i<=2*RADIUSX; i++)
        sum = mad(lsmemDy[liy][lix+i], as_float(mat_kernelX[i]), sum);
    //store result into destination image
    DST(x,y) = CONVERT_DSTTYPE(sum);
 }
--- a/modules/ocl/src/opencl/haarobjectdetect.cl
+++ b/modules/ocl/src/opencl/haarobjectdetect.cl
@ -126,13 +126,11 @@ __kernel void gpuRunHaarClassifierCascadePacked(
    )
 {
 // this version used information provided for each workgroup
 // no empty WG
    int     gid = (int)get_group_id(0);
    int     lid_x = (int)get_local_id(0);
    int     lid_y = (int)get_local_id(1);
    int     lid = lid_y*LSx+lid_x;
-    int4    WGInfo = pWGInfo[gid];
+    int4    WGInfo = pWGInfo[WGSTART+gid];
    int     GroupX = (WGInfo.y >> 16)&0xFFFF;
    int     GroupY = (WGInfo.y >> 0 )& 0xFFFF;
    int     Width  = (WGInfo.x >> 16)&0xFFFF;
@ -140,8 +138,8 @@ __kernel void gpuRunHaarClassifierCascadePacked(
    int     ImgOffset = WGInfo.z;
    float   ScaleFactor = as_float(WGInfo.w);
-#define DATA_SIZE_X (LSx+WND_SIZE_X)
+#define DATA_SIZE_X (PIXEL_STEP*LSx+WND_SIZE_X)
-#define DATA_SIZE_Y (LSy+WND_SIZE_Y)
+#define DATA_SIZE_Y (PIXEL_STEP*LSy+WND_SIZE_Y)
 #define DATA_SIZE (DATA_SIZE_X*DATA_SIZE_Y)
    local int SumL[DATA_SIZE];
@ -165,9 +163,11 @@ __kernel void gpuRunHaarClassifierCascadePacked(
    int4    info1 = p;
    int4    info2 = pq;
-    {
+    // calc processed ROI coordinate in local mem
-        int     xl = lid_x;
+    int     xl = lid_x*PIXEL_STEP;
-        int     yl = lid_y;
+    int     yl = lid_y*PIXEL_STEP;
    {// calc variance_norm_factor for all stages
        int     OffsetLocal =          yl * DATA_SIZE_X +         xl;
        int     OffsetGlobal = (GroupY+yl)* pixelstep   + (GroupX+xl);
@ -194,13 +194,13 @@ __kernel void gpuRunHaarClassifierCascadePacked(
    int result = (1.0f>0.0f);
    for(int stageloop = start_stage; (stageloop < end_stage) && result; stageloop++ )
-    {// iterate until candidate is exist
+    {// iterate until candidate is valid
        float   stage_sum = 0.0f;
        __global GpuHidHaarStageClassifier* stageinfo = (__global GpuHidHaarStageClassifier*)
            ((__global uchar*)stagecascadeptr+stageloop*sizeof(GpuHidHaarStageClassifier));
        int     lcl_off = (yl*DATA_SIZE_X)+(xl);
        int stagecount = stageinfo->count;
        float stagethreshold = stageinfo->threshold;
        int     lcl_off = (lid_y*DATA_SIZE_X)+(lid_x);
        for(int nodeloop = 0; nodeloop < stagecount; nodecounter++,nodeloop++ )
        {
        // simple macro to extract shorts from int
@ -212,7 +212,7 @@ __kernel void gpuRunHaarClassifierCascadePacked(
            int4    n1 = pN[1];
            int4    n2 = pN[2];
            float   nodethreshold  = as_float(n2.y) * variance_norm_factor;
-            // calc sum of intensity pixels according to node information
+            // calc sum of intensity pixels according to classifier node information
            float classsum =
                (SumL[M0(n0.x)+lcl_off] - SumL[M1(n0.x)+lcl_off] - SumL[M0(n0.y)+lcl_off] + SumL[M1(n0.y)+lcl_off]) * as_float(n1.z) +
                (SumL[M0(n0.z)+lcl_off] - SumL[M1(n0.z)+lcl_off] - SumL[M0(n0.w)+lcl_off] + SumL[M1(n0.w)+lcl_off]) * as_float(n1.w) +
@ -228,8 +228,8 @@ __kernel void gpuRunHaarClassifierCascadePacked(
        int index = 1+atomic_inc((volatile global int*)candidate); //get index to write global data with face info
        if(index<OUTPUTSZ)
        {
-            int     x = GroupX+lid_x;
+            int     x = GroupX+xl;
-            int     y = GroupY+lid_y;
+            int     y = GroupY+yl;
            int4 candidate_result;
            candidate_result.x = convert_int_rtn(x*ScaleFactor);
            candidate_result.y = convert_int_rtn(y*ScaleFactor);
--- a/modules/ocl/src/opencl/imgproc_gftt.cl
+++ b/modules/ocl/src/opencl/imgproc_gftt.cl
@ -46,33 +46,26 @@
 #ifndef WITH_MASK
 #define WITH_MASK 0
 #endif
-
+//macro to read eigenvalue matrix
-__constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;
+#define GET_SRC_32F(_x, _y) ((__global const float*)(eig + (_y)*eig_pitch))[_x]
 inline float ELEM_INT2(image2d_t _eig, int _x, int _y)
 {
    return read_imagef(_eig, sampler, (int2)(_x, _y)).x;
 }
 inline float ELEM_FLT2(image2d_t _eig, float2 pt)
 {
    return read_imagef(_eig, sampler, pt).x;
 }
 __kernel
    void findCorners
    (
-        image2d_t eig,
+        __global const char*    eig,
-        __global const char * mask,
+        const int               eig_pitch,
-        __global float2 * corners,
+        __global const char*    mask,
-        const int mask_strip,// in pixels
+        __global float2*        corners,
-        const float threshold,
+        const int               mask_strip,// in pixels
-        const int rows,
+        __global const float*   pMinMax,
-        const int cols,
+        const float             qualityLevel,
-        const int max_count,
+        const int               rows,
-        __global int * g_counter
+        const int               cols,
        const int               max_count,
        __global int*           g_counter
    )
 {
    float threshold = qualityLevel*pMinMax[1];
    const int j = get_global_id(0);
    const int i = get_global_id(1);
@ -82,39 +75,42 @@ __kernel
 #endif
        )
    {
-        const float val = ELEM_INT2(eig, j, i);
+        const float val = GET_SRC_32F(j, i);
        if (val > threshold)
        {
            float maxVal = val;
            maxVal = fmax(GET_SRC_32F(j - 1, i - 1), maxVal);
            maxVal = fmax(GET_SRC_32F(j    , i - 1), maxVal);
            maxVal = fmax(GET_SRC_32F(j + 1, i - 1), maxVal);
-            maxVal = fmax(ELEM_INT2(eig, j - 1, i - 1), maxVal);
+            maxVal = fmax(GET_SRC_32F(j - 1, i), maxVal);
-            maxVal = fmax(ELEM_INT2(eig, j    , i - 1), maxVal);
+            maxVal = fmax(GET_SRC_32F(j + 1, i), maxVal);
            maxVal = fmax(ELEM_INT2(eig, j + 1, i - 1), maxVal);
-            maxVal = fmax(ELEM_INT2(eig, j - 1, i), maxVal);
+            maxVal = fmax(GET_SRC_32F(j - 1, i + 1), maxVal);
-            maxVal = fmax(ELEM_INT2(eig, j + 1, i), maxVal);
+            maxVal = fmax(GET_SRC_32F(j    , i + 1), maxVal);
-
+            maxVal = fmax(GET_SRC_32F(j + 1, i + 1), maxVal);
            maxVal = fmax(ELEM_INT2(eig, j - 1, i + 1), maxVal);
            maxVal = fmax(ELEM_INT2(eig, j    , i + 1), maxVal);
            maxVal = fmax(ELEM_INT2(eig, j + 1, i + 1), maxVal);
            if (val == maxVal)
            {
                const int ind = atomic_inc(g_counter);
                if (ind < max_count)
-                    corners[ind] = (float2)(j, i);
+                {// pack and store eigenvalue and its coordinates
                    corners[ind].x = val;
                    corners[ind].y = as_float(j|(i<<16));
                }
            }
        }
    }
 }
 #undef GET_SRC_32F
 //bitonic sort
 __kernel
    void sortCorners_bitonicSort
    (
        image2d_t eig,
        __global float2 * corners,
        const int count,
        const int stage,
@ -140,8 +136,8 @@ __kernel
    const float2 leftPt  = corners[leftId];
    const float2 rightPt = corners[rightId];
-    const float leftVal  = ELEM_FLT2(eig, leftPt);
+    const float leftVal  = leftPt.x;
-    const float rightVal = ELEM_FLT2(eig, rightPt);
+    const float rightVal = rightPt.x;
    const bool compareResult = leftVal > rightVal;
@ -152,124 +148,22 @@ __kernel
    corners[rightId] = sortOrder ? greater : lesser;
 }
-//selection sort for gfft
+// this is simple short serial kernel that makes some short reduction and initialization work
-//kernel is ported from Bolt library:
+// it makes HOST like work to avoid additional sync with HOST to do this short work
-//https://github.com/HSA-Libraries/Bolt/blob/master/include/bolt/cl/sort_kernels.cl
+// data - input/output float2.
-//  Local sort will firstly sort elements of each workgroup using selection sort
+//      input data are sevral (min,max) pairs
-//  its performance is O(n)
+//      output data is one reduced (min,max) pair
-__kernel
+// g_counter - counter that have to be initialized by 0 for next findCorner call.
-    void sortCorners_selectionSortLocal
+__kernel void arithm_op_minMax_final(__global float * data, int groupnum,__global int * g_counter)
    (
        image2d_t eig,
        __global float2 * corners,
        const int count,
        __local float2 * scratch
    )
 {
-    int          i  = get_local_id(0); // index in workgroup
+    g_counter[0] = 0;
-    int numOfGroups = get_num_groups(0); // index in workgroup
+    float minVal = data[0];
-    int groupID     = get_group_id(0);
+    float maxVal = data[groupnum];
-    int         wg  = get_local_size(0); // workgroup size = block size
+    for(int i=1;i<groupnum;++i)
    int n; // number of elements to be processed for this work group
    int offset   = groupID * wg;
    int same     = 0;
    corners      += offset;
    n = (groupID == (numOfGroups-1))? (count - wg*(numOfGroups-1)) : wg;
    float2 pt1, pt2;
    pt1 = corners[min(i, n)];
    scratch[i] = pt1;
    barrier(CLK_LOCAL_MEM_FENCE);
    if(i >= n)
    {
-        return;
+        minVal = min(minVal,data[i]);
        maxVal = max(maxVal,data[i+groupnum]);
    }
-
+    data[0] = minVal;
-    float val1 = ELEM_FLT2(eig, pt1);
+    data[1] = maxVal;
    float val2;
    int pos = 0;
    for (int j=0;j<n;++j)
    {
        pt2  = scratch[j];
        val2 = ELEM_FLT2(eig, pt2);
        if(val2 > val1)
            pos++;//calculate the rank of this element in this work group
        else
        {
            if(val1 > val2)
                continue;
            else
            {
                // val1 and val2 are same
                same++;
            }
        }
    }
    for (int j=0; j< same; j++)
        corners[pos + j] = pt1;
 }
 __kernel
    void sortCorners_selectionSortFinal
    (
        image2d_t eig,
        __global float2 * corners,
        const int count
    )
 {
    const int          i  = get_local_id(0); // index in workgroup
    const int numOfGroups = get_num_groups(0); // index in workgroup
    const int groupID     = get_group_id(0);
    const int         wg  = get_local_size(0); // workgroup size = block size
    int pos = 0, same = 0;
    const int offset = get_group_id(0) * wg;
    const int remainder = count - wg*(numOfGroups-1);
    if((offset + i ) >= count)
        return;
    float2 pt1, pt2;
    pt1 = corners[groupID*wg + i];
    float val1 = ELEM_FLT2(eig, pt1);
    float val2;
    for(int j=0; j<numOfGroups-1; j++ )
    {
        for(int k=0; k<wg; k++)
        {
            pt2  = corners[j*wg + k];
            val2 = ELEM_FLT2(eig, pt2);
            if(val1 > val2)
                break;
            else
            {
                //Increment only if the value is not the same.
                if( val2 > val1 )
                    pos++;
                else
                    same++;
            }
        }
    }
    for(int k=0; k<remainder; k++)
    {
        pt2  = corners[(numOfGroups-1)*wg + k];
        val2 = ELEM_FLT2(eig, pt2);
        if(val1 > val2)
            break;
        else
        {
            //Don't increment if the value is the same.
            //Two elements are same if (*userComp)(jData, iData)  and (*userComp)(iData, jData) are both false
            if(val2 > val1)
                pos++;
            else
                same++;
        }
    }
    for (int j=0; j< same; j++)
        corners[pos + j] = pt1;
 }
--- a/modules/ocl/src/opencl/imgproc_sobel3.cl
+++ b/modules/ocl/src/opencl/imgproc_sobel3.cl
@ -1,45 +1,97 @@
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 /////////////////////////////////Macro for border type////////////////////////////////////////////
 /////////////////////////////////////////////////////////////////////////////////////////////////
-#ifdef BORDER_REPLICATE
+
-//BORDER_REPLICATE:     aaaaaa|abcdefgh|hhhhhhh
+#ifdef BORDER_CONSTANT
-#define ADDR_L(i, l_edge, r_edge)  ((i) <  (l_edge) ? (l_edge)   : (i))
+//CCCCCC|abcdefgh|CCCCCCC
-#define ADDR_R(i, r_edge, addr)    ((i) >= (r_edge) ? (r_edge)-1 : (addr))
+#define EXTRAPOLATE(x, maxV)
-#define ADDR_H(i, t_edge, b_edge)  ((i) <  (t_edge) ? (t_edge)   :(i))
+#elif defined BORDER_REPLICATE
-#define ADDR_B(i, b_edge, addr)    ((i) >= (b_edge) ? (b_edge)-1 :(addr))
+//aaaaaa|abcdefgh|hhhhhhh
 #define EXTRAPOLATE(x, maxV) \
    { \
        (x) = max(min((x), (maxV) - 1), 0); \
    }
 #elif defined BORDER_WRAP
 //cdefgh|abcdefgh|abcdefg
 #define EXTRAPOLATE(x, maxV) \
    { \
        (x) = ( (x) + (maxV) ) % (maxV); \
    }
 #elif defined BORDER_REFLECT
 //fedcba|abcdefgh|hgfedcb
 #define EXTRAPOLATE(x, maxV) \
    { \
        (x) = min( mad24((maxV)-1,2,-(x))+1 , max((x),-(x)-1) ); \
    }
 #elif defined BORDER_REFLECT_101
 //gfedcb|abcdefgh|gfedcba
 #define EXTRAPOLATE(x, maxV) \
    { \
        (x) = min( mad24((maxV)-1,2,-(x)), max((x),-(x)) ); \
    }
 #else
 #error No extrapolation method
 #endif
-#ifdef BORDER_REFLECT
+#define SRC(_x,_y) convert_float(((global SRCTYPE*)(Src+(_y)*SrcPitch))[_x])
-//BORDER_REFLECT:       fedcba|abcdefgh|hgfedcb
+
-#define ADDR_L(i, l_edge, r_edge)  ((i) <  (l_edge) ? -(i)-1               : (i))
+#ifdef BORDER_CONSTANT
-#define ADDR_R(i, r_edge, addr)    ((i) >= (r_edge) ? -(i)-1+((r_edge)<<1) : (addr))
+//CCCCCC|abcdefgh|CCCCCCC
-#define ADDR_H(i, t_edge, b_edge)  ((i) <  (t_edge) ? -(i)-1 : (i))
+#define ELEM(_x,_y,r_edge,t_edge,const_v) (_x)<0 | (_x) >= (r_edge) | (_y)<0 | (_y) >= (t_edge) ? (const_v) : SRC((_x),(_y))
-#define ADDR_B(i, b_edge, addr)    ((i) >= (b_edge) ? -(i)-1+((b_edge)<<1) : (addr))
+#else
 #define ELEM(_x,_y,r_edge,t_edge,const_v) SRC((_x),(_y))
 #endif
-#ifdef BORDER_REFLECT101
+#define DSTX(_x,_y) (((global float*)(DstX+DstXOffset+(_y)*DstXPitch))[_x])
-//BORDER_REFLECT101:   gfedcb|abcdefgh|gfedcba
+#define DSTY(_x,_y) (((global float*)(DstY+DstYOffset+(_y)*DstYPitch))[_x])
 #define ADDR_L(i, l_edge, r_edge)  ((i) <  (l_edge) ? -(i)                 : (i))
 #define ADDR_R(i, r_edge, addr)    ((i) >= (r_edge) ? -(i)-2+((r_edge)<<1) : (addr))
 #define ADDR_H(i, t_edge, b_edge)  ((i) <  (t_edge) ? -(i)                 : (i))
 #define ADDR_B(i, b_edge, addr)    ((i) >= (b_edge) ? -(i)-2+((b_edge)<<1) : (addr))
 #endif
-#ifdef BORDER_WRAP
+#define INIT_AND_READ_LOCAL_SOURCE(width, height, fill_const, kernel_border) \
-//BORDER_WRAP:          cdefgh|abcdefgh|abcdefg
+    int srcX = x + srcOffsetX - (kernel_border); \
-#define ADDR_L(i, l_edge, r_edge)  ((i) <  (l_edge) ? (i)+(r_edge) : (i))
+    int srcY = y + srcOffsetY - (kernel_border); \
-#define ADDR_R(i, r_edge, addr)    ((i) >= (r_edge) ? (i)-(r_edge) : (addr))
+    int xb = srcX; \
-#define ADDR_H(i, t_edge, b_edge)  ((i) <  (t_edge) ? (i)+(b_edge) : (i))
+    int yb = srcY; \
-#define ADDR_B(i, b_edge, addr)    ((i) >= (b_edge) ? (i)-(b_edge) : (addr))
+    \
-#endif
+    EXTRAPOLATE(xb, (width)); \
    EXTRAPOLATE(yb, (height)); \
    lsmem[liy][lix] = ELEM(xb, yb, (width), (height), (fill_const) ); \
    \
    if(lix < ((kernel_border)*2)) \
    { \
        int xb = srcX+BLK_X; \
        EXTRAPOLATE(xb,(width)); \
        lsmem[liy][lix+BLK_X] = ELEM(xb, yb, (width), (height), (fill_const) ); \
    } \
    if(liy< ((kernel_border)*2)) \
    { \
        int yb = srcY+BLK_Y; \
        EXTRAPOLATE(yb, (height)); \
        lsmem[liy+BLK_Y][lix] = ELEM(xb, yb, (width), (height), (fill_const) ); \
    } \
    if(lix<((kernel_border)*2) && liy<((kernel_border)*2)) \
    { \
        int xb = srcX+BLK_X; \
        int yb = srcY+BLK_Y; \
        EXTRAPOLATE(xb,(width)); \
        EXTRAPOLATE(yb,(height)); \
        lsmem[liy+BLK_Y][lix+BLK_X] = ELEM(xb, yb, (width), (height), (fill_const) ); \
    }
 __kernel void sobel3(
        __global uchar* Src,
-        __global float* DstX,
+        const uint      SrcPitch,
-        __global float* DstY,
+        const int       srcOffsetX,
-        int width, int height,
+        const int       srcOffsetY,
-        uint srcStride, uint dstStride,
+        __global uchar* DstX,
-        float scale
+        const int       DstXOffset,
        const uint      DstXPitch,
        __global uchar* DstY,
        const int       DstYOffset,
        const uint      DstYPitch,
        int             width,
        int             height,
        int             dstWidth,
        int             dstHeight,
        float           scale
        )
 {
    __local float lsmem[BLK_Y+2][BLK_X+2];
@ -47,62 +99,249 @@ __kernel void sobel3(
    int lix = get_local_id(0);
    int liy = get_local_id(1);
-    int gix = get_group_id(0);
+    int x = (int)get_global_id(0);
-    int giy = get_group_id(1);
+    int y = (int)get_global_id(1);
    int id_x = get_global_id(0);
    int id_y = get_global_id(1);
    lsmem[liy+1][lix+1] = convert_float(Src[ id_y * srcStride + id_x ]);
    int id_y_h = ADDR_H(id_y-1, 0,height);
    int id_y_b = ADDR_B(id_y+1, height,id_y+1);
    int id_x_l = ADDR_L(id_x-1, 0,width);
    int id_x_r = ADDR_R(id_x+1, width,id_x+1);
    if(liy==0)
    {
        lsmem[0][lix+1]=convert_float(Src[ id_y_h * srcStride + id_x ]);
        if(lix==0)
            lsmem[0][0]=convert_float(Src[ id_y_h * srcStride + id_x_l ]);
        else if(lix==BLK_X-1)
            lsmem[0][BLK_X+1]=convert_float(Src[ id_y_h * srcStride + id_x_r ]);
    }
    else if(liy==BLK_Y-1)
    {
        lsmem[BLK_Y+1][lix+1]=convert_float(Src[ id_y_b * srcStride + id_x ]);
        if(lix==0)
            lsmem[BLK_Y+1][0]=convert_float(Src[ id_y_b * srcStride + id_x_l ]);
        else if(lix==BLK_X-1)
            lsmem[BLK_Y+1][BLK_X+1]=convert_float(Src[ id_y_b * srcStride + id_x_r ]);
    }
    if(lix==0)
        lsmem[liy+1][0]    = convert_float(Src[ id_y * srcStride + id_x_l ]);
    else if(lix==BLK_X-1)
        lsmem[liy+1][BLK_X+1] = convert_float(Src[ id_y * srcStride + id_x_r ]);
    INIT_AND_READ_LOCAL_SOURCE(width, height, 0, 1)
    barrier(CLK_LOCAL_MEM_FENCE);
    if( x >= dstWidth || y >=dstHeight )  return;
    float u1 = lsmem[liy][lix];
    float u2 = lsmem[liy][lix+1];
    float u3 = lsmem[liy][lix+2];
    float m1 = lsmem[liy+1][lix];
    float m2 = lsmem[liy+1][lix+1];
    float m3 = lsmem[liy+1][lix+2];
    float b1 = lsmem[liy+2][lix];
    float b2 = lsmem[liy+2][lix+1];
    float b3 = lsmem[liy+2][lix+2];
-    //m2 * scale;//
+    //calc and store dx and dy;//
-    float dx = mad(2.0f, m3 - m1, u3 - u1 + b3 - b1 );
+#ifdef SCHARR
-    DstX[ id_y * dstStride + id_x ] = dx * scale;
+    DSTX(x,y) = mad(10.0f, m3 - m1, 3.0f * (u3 - u1 + b3 - b1)) * scale;
-
+    DSTY(x,y) = mad(10.0f, b2 - u2, 3.0f * (b1 - u1 + b3 - u3)) * scale;
-    float dy = mad(2.0f, b2 - u2, b1 - u1 + b3 - u3);
+#else
-    DstY[ id_y * dstStride + id_x ] = dy * scale;
+    DSTX(x,y) = mad(2.0f, m3 - m1, u3 - u1 + b3 - b1) * scale;
    DSTY(x,y) = mad(2.0f, b2 - u2, b1 - u1 + b3 - u3) * scale;
 #endif
 }
 __kernel void sobel5(
        __global uchar* Src,
        const uint      SrcPitch,
        const int       srcOffsetX,
        const int       srcOffsetY,
        __global uchar* DstX,
        const int       DstXOffset,
        const uint      DstXPitch,
        __global uchar* DstY,
        const int       DstYOffset,
        const uint      DstYPitch,
        int             width,
        int             height,
        int             dstWidth,
        int             dstHeight,
        float           scale
        )
 {
    __local float lsmem[BLK_Y+4][BLK_X+4];
    int lix = get_local_id(0);
    int liy = get_local_id(1);
    int x = (int)get_global_id(0);
    int y = (int)get_global_id(1);
    INIT_AND_READ_LOCAL_SOURCE(width, height, 0, 2)
    barrier(CLK_LOCAL_MEM_FENCE);
    if( x >= dstWidth || y >=dstHeight )  return;
    float t1 = lsmem[liy][lix];
    float t2 = lsmem[liy][lix+1];
    float t3 = lsmem[liy][lix+2];
    float t4 = lsmem[liy][lix+3];
    float t5 = lsmem[liy][lix+4];
    float u1 = lsmem[liy+1][lix];
    float u2 = lsmem[liy+1][lix+1];
    float u3 = lsmem[liy+1][lix+2];
    float u4 = lsmem[liy+1][lix+3];
    float u5 = lsmem[liy+1][lix+4];
    float m1 = lsmem[liy+2][lix];
    float m2 = lsmem[liy+2][lix+1];
    float m4 = lsmem[liy+2][lix+3];
    float m5 = lsmem[liy+2][lix+4];
    float l1 = lsmem[liy+3][lix];
    float l2 = lsmem[liy+3][lix+1];
    float l3 = lsmem[liy+3][lix+2];
    float l4 = lsmem[liy+3][lix+3];
    float l5 = lsmem[liy+3][lix+4];
    float b1 = lsmem[liy+4][lix];
    float b2 = lsmem[liy+4][lix+1];
    float b3 = lsmem[liy+4][lix+2];
    float b4 = lsmem[liy+4][lix+3];
    float b5 = lsmem[liy+4][lix+4];
    //calc and store dx and dy;//
    DSTX(x,y) = scale *
        mad(12.0f, m4 - m2,
            mad(6.0f, m5 - m1,
                mad(8.0f, u4 - u2 + l4 - l2,
                    mad(4.0f, u5 - u1 + l5 - l1,
                        mad(2.0f, t4 - t2 + b4 - b2, t5 - t1 + b5 - b1 )
                        )
                    )
                )
            );
    DSTY(x,y) = scale *
        mad(12.0f, l3 - u3,
            mad(6.0f, b3 - t3,
                mad(8.0f, l2 - u2 + l4 - u4,
                    mad(4.0f, b2 - t2 + b4 - t4,
                        mad(2.0f, l1 - u1 + l5 - u5, b1 - t1 + b5 - t5 )
                        )
                    )
                )
            );
 }
 __kernel void sobel7(
        __global uchar* Src,
        const uint      SrcPitch,
        const int       srcOffsetX,
        const int       srcOffsetY,
        __global uchar* DstX,
        const int       DstXOffset,
        const uint      DstXPitch,
        __global uchar* DstY,
        const int       DstYOffset,
        const uint      DstYPitch,
        int             width,
        int             height,
        int             dstWidth,
        int             dstHeight,
        float           scale
        )
 {
    __local float lsmem[BLK_Y+6][BLK_X+6];
    int lix = get_local_id(0);
    int liy = get_local_id(1);
    int x = (int)get_global_id(0);
    int y = (int)get_global_id(1);
    INIT_AND_READ_LOCAL_SOURCE(width, height, 0, 3)
    barrier(CLK_LOCAL_MEM_FENCE);
    if( x >= dstWidth || y >=dstHeight )  return;
    float tt1 = lsmem[liy][lix];
    float tt2 = lsmem[liy][lix+1];
    float tt3 = lsmem[liy][lix+2];
    float tt4 = lsmem[liy][lix+3];
    float tt5 = lsmem[liy][lix+4];
    float tt6 = lsmem[liy][lix+5];
    float tt7 = lsmem[liy][lix+6];
    float t1 = lsmem[liy+1][lix];
    float t2 = lsmem[liy+1][lix+1];
    float t3 = lsmem[liy+1][lix+2];
    float t4 = lsmem[liy+1][lix+3];
    float t5 = lsmem[liy+1][lix+4];
    float t6 = lsmem[liy+1][lix+5];
    float t7 = lsmem[liy+1][lix+6];
    float u1 = lsmem[liy+2][lix];
    float u2 = lsmem[liy+2][lix+1];
    float u3 = lsmem[liy+2][lix+2];
    float u4 = lsmem[liy+2][lix+3];
    float u5 = lsmem[liy+2][lix+4];
    float u6 = lsmem[liy+2][lix+5];
    float u7 = lsmem[liy+2][lix+6];
    float m1 = lsmem[liy+3][lix];
    float m2 = lsmem[liy+3][lix+1];
    float m3 = lsmem[liy+3][lix+2];
    float m5 = lsmem[liy+3][lix+4];
    float m6 = lsmem[liy+3][lix+5];
    float m7 = lsmem[liy+3][lix+6];
    float l1 = lsmem[liy+4][lix];
    float l2 = lsmem[liy+4][lix+1];
    float l3 = lsmem[liy+4][lix+2];
    float l4 = lsmem[liy+4][lix+3];
    float l5 = lsmem[liy+4][lix+4];
    float l6 = lsmem[liy+4][lix+5];
    float l7 = lsmem[liy+4][lix+6];
    float b1 = lsmem[liy+5][lix];
    float b2 = lsmem[liy+5][lix+1];
    float b3 = lsmem[liy+5][lix+2];
    float b4 = lsmem[liy+5][lix+3];
    float b5 = lsmem[liy+5][lix+4];
    float b6 = lsmem[liy+5][lix+5];
    float b7 = lsmem[liy+5][lix+6];
    float bb1 = lsmem[liy+6][lix];
    float bb2 = lsmem[liy+6][lix+1];
    float bb3 = lsmem[liy+6][lix+2];
    float bb4 = lsmem[liy+6][lix+3];
    float bb5 = lsmem[liy+6][lix+4];
    float bb6 = lsmem[liy+6][lix+5];
    float bb7 = lsmem[liy+6][lix+6];
    //calc and store dx and dy
    DSTX(x,y) = scale *
        mad(100.0f, m5 - m3,
            mad(80.0f, m6 - m2,
                mad(20.0f, m7 - m1,
                    mad(75.0f, u5 - u3 + l5 - l3,
                        mad(60.0f, u6 - u2 + l6 - l2,
                            mad(15.0f, u7 - u1 + l7 - l1,
                                mad(30.0f, t5 - t3 + b5 - b3,
                                    mad(24.0f, t6 - t2 + b6 - b2,
                                        mad(6.0f, t7 - t1 + b7 - b1,
                                            mad(5.0f, tt5 - tt3 + bb5 - bb3,
                                                mad(4.0f, tt6 - tt2 + bb6 - bb2, tt7 - tt1 + bb7 - bb1 )
                                                )
                                            )
                                        )
                                    )
                                )
                            )
                        )
                    )
                )
            );
    DSTY(x,y) = scale *
        mad(100.0f, l4 - u4,
            mad(80.0f, b4 - t4,
                mad(20.0f, bb4 - tt4,
                    mad(75.0f, l5 - u5 + l3 - u3,
                        mad(60.0f, b5 - t5 + b3 - t3,
                            mad(15.0f, bb5 - tt5 + bb3 - tt3,
                                mad(30.0f, l6 - u6 + l2 - u2,
                                    mad(24.0f, b6 - t6 + b2 - t2,
                                        mad(6.0f, bb6 - tt6 + bb2 - tt2,
                                            mad(5.0f, l7 - u7 + l1 - u1,
                                                mad(4.0f, b7 - t7 + b1 - t1, bb7 - tt7 + bb1 - tt1 )
                                                )
                                            )
                                        )
                                    )
                                )
                            )
                        )
                    )
                )
            );
 }
--- a/modules/ocl/src/opencl/objdetect_hog.cl
+++ b/modules/ocl/src/opencl/objdetect_hog.cl
@ -50,6 +50,14 @@
 #define NTHREADS 256
 #define CV_PI_F 3.1415926535897932384626433832795f
 #ifdef INTEL_DEVICE
 #define QANGLE_TYPE		int
 #define QANGLE_TYPE2	int2
 #else
 #define QANGLE_TYPE		uchar
 #define QANGLE_TYPE2	uchar2
 #endif
 //----------------------------------------------------------------------------
 // Histogram computation
 // 12 threads for a cell, 12x4 threads per block
@ -59,7 +67,7 @@ __kernel void compute_hists_lut_kernel(
    const int cnbins, const int cblock_hist_size, const int img_block_width,
    const int blocks_in_group, const int blocks_total,
    const int grad_quadstep, const int qangle_step,
-    __global const float* grad, __global const uchar* qangle,
+    __global const float* grad, __global const QANGLE_TYPE* qangle,
    __global const float* gauss_w_lut,
    __global float* block_hists, __local float* smem)
 {
@ -86,7 +94,7 @@ __kernel void compute_hists_lut_kernel(
    __global const float* grad_ptr = (gid < blocks_total) ?
        grad + offset_y * grad_quadstep + (offset_x << 1) : grad;
-    __global const uchar* qangle_ptr = (gid < blocks_total) ?
+    __global const QANGLE_TYPE* qangle_ptr = (gid < blocks_total) ?
        qangle + offset_y * qangle_step + (offset_x << 1) : qangle;
    __local float* hist = hists + 12 * (cell_y * CELLS_PER_BLOCK_Y + cell_x) +
@ -101,7 +109,7 @@ __kernel void compute_hists_lut_kernel(
    for (int dist_y = dist_y_begin; dist_y < dist_y_begin + 12; ++dist_y)
    {
        float2 vote = (float2) (grad_ptr[0], grad_ptr[1]);
-        uchar2 bin = (uchar2) (qangle_ptr[0], qangle_ptr[1]);
+        QANGLE_TYPE2 bin = (QANGLE_TYPE2) (qangle_ptr[0], qangle_ptr[1]);
        grad_ptr += grad_quadstep;
        qangle_ptr += qangle_step;
@ -558,7 +566,7 @@ __kernel void extract_descrs_by_cols_kernel(
 __kernel void compute_gradients_8UC4_kernel(
    const int height, const int width,
    const int img_step, const int grad_quadstep, const int qangle_step,
-    const __global uchar4 * img, __global float * grad, __global uchar * qangle,
+    const __global uchar4 * img, __global float * grad, __global QANGLE_TYPE * qangle,
    const float angle_scale, const char correct_gamma, const int cnbins)
 {
    const int x = get_global_id(0);
@ -660,7 +668,7 @@ __kernel void compute_gradients_8UC4_kernel(
 __kernel void compute_gradients_8UC1_kernel(
    const int height, const int width,
    const int img_step, const int grad_quadstep, const int qangle_step,
-    __global const uchar * img, __global float * grad, __global uchar * qangle,
+    __global const uchar * img, __global float * grad, __global QANGLE_TYPE * qangle,
    const float angle_scale, const char correct_gamma, const int cnbins)
 {
    const int x = get_global_id(0);
--- a/modules/ts/src/ts_func.cpp
+++ b/modules/ts/src/ts_func.cpp
@ -116,7 +116,7 @@ Mat randomMat(RNG& rng, Size size, int type, double minVal, double maxVal, bool
    Mat m(size0, type);
-    rng.fill(m, RNG::UNIFORM, Scalar::all(minVal), Scalar::all(maxVal));
+    rng.fill(m, RNG::UNIFORM, minVal, maxVal);
    if( size0 == size )
        return m;
    return m(Rect((size0.width-size.width)/2, (size0.height-size.height)/2, size.width, size.height));
@ -142,7 +142,7 @@ Mat randomMat(RNG& rng, const vector<int>& size, int type, double minVal, double
    Mat m(dims, &size0[0], type);
-    rng.fill(m, RNG::UNIFORM, Scalar::all(minVal), Scalar::all(maxVal));
+    rng.fill(m, RNG::UNIFORM, minVal, maxVal);
    if( eqsize )
        return m;
    return m(&r[0]);
--- a/platforms/linux/arm-gnueabi.toolchain.cmake
+++ b/platforms/linux/arm-gnueabi.toolchain.cmake
@ -28,14 +28,11 @@ set(CMAKE_MODULE_LINKER_FLAGS "-Wl,--fix-cortex-a8 -Wl,--no-undefined -Wl,--gc-s
 set(CMAKE_EXE_LINKER_FLAGS    "-Wl,--fix-cortex-a8 -Wl,--no-undefined -Wl,--gc-sections -Wl,-z,noexecstack -Wl,-z,relro -Wl,-z,now ${CMAKE_EXE_LINKER_FLAGS}")
 if(USE_NEON)
-  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfpu=neon")
+  message(WARNING "You use obsolete variable USE_NEON to enable NEON instruction set. Use -DENABLE_NEON=ON instead." )
-  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfpu=neon")
+  set(ENABLE_NEON TRUE)
 elseif(USE_VFPV3)
-  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfpu=vfpv3")
+  message(WARNING "You use obsolete variable USE_VFPV3 to enable VFPV3 instruction set. Use -DENABLE_VFPV3=ON instead." )
-  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfpu=vfpv3")
+  set(ENABLE_VFPV3 TRUE)
 else()
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfpu=vfpv3-d16")
  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfpu=vfpv3-d16")
 endif()
 set(CMAKE_FIND_ROOT_PATH ${CMAKE_FIND_ROOT_PATH} ${ARM_LINUX_SYSROOT})
--- a/samples/cpp/intelperc_capture.cpp
+++ b/samples/cpp/intelperc_capture.cpp
@ -0,0 +1,376 @@
 // testOpenCVCam.cpp : Defines the entry point for the console application.
 //
 #include "opencv2/highgui/highgui.hpp"
 #include <iostream>
 using namespace cv;
 using namespace std;
 static bool g_printStreamSetting        = false;
 static int g_imageStreamProfileIdx      = -1;
 static int g_depthStreamProfileIdx      = -1;
 static bool g_irStreamShow              = false;
 static double g_imageBrightness         = -DBL_MAX;
 static double g_imageContrast           = -DBL_MAX;
 static bool g_printTiming               = false;
 static bool g_showClosedPoint           = false;
 static int g_closedDepthPoint[2];
 static void printUsage(const char *arg0)
 {
    const char *filename = arg0;
    while (*filename)
        filename++;
    while ((arg0 <= filename) && ('\\' != *filename) && ('/' != *filename))
        filename--;
    filename++;
    cout << "This program demonstrates usage of camera supported\nby Intel Perceptual computing SDK." << endl << endl;
    cout << "usage: " << filename << "[-ps] [-isp IDX] [-dsp IDX]\n [-ir] [-imb VAL] [-imc VAL]" << endl << endl;
    cout << "   -ps,            print streams setting and profiles" << endl;
    cout << "   -isp IDX,       set profile index of the image stream" << endl;
    cout << "   -dsp IDX,       set profile index of the depth stream" << endl;
    cout << "   -ir,            show data from IR stream" << endl;
    cout << "   -imb VAL,       set brighness value for a image stream" << endl;
    cout << "   -imc VAL,       set contrast value for a image stream" << endl;
    cout << "   -pts,           print frame index and frame time" << endl;
    cout << "   --show-closed,  print frame index and frame time" << endl;
    cout <<  endl;
 }
 static void parseCMDLine(int argc, char* argv[])
 {
    if( argc == 1 )
    {
        printUsage(argv[0]);
    }
    else
    {
        for( int i = 1; i < argc; i++ )
        {
            if ((0 == strcmp(argv[i], "--help")) || (0 == strcmp( argv[i], "-h")))
            {
                printUsage(argv[0]);
                exit(0);
            }
            else if ((0 == strcmp( argv[i], "--print-streams")) || (0 == strcmp( argv[i], "-ps")))
            {
                g_printStreamSetting = true;
            }
            else if ((0 == strcmp( argv[i], "--image-stream-prof")) || (0 == strcmp( argv[i], "-isp")))
            {
                g_imageStreamProfileIdx = atoi(argv[++i]);
            }
            else if ((0 == strcmp( argv[i], "--depth-stream-prof")) || (0 == strcmp( argv[i], "-dsp")))
            {
                g_depthStreamProfileIdx = atoi(argv[++i]);
            }
            else if (0 == strcmp( argv[i], "-ir"))
            {
                g_irStreamShow = true;
            }
            else if (0 == strcmp( argv[i], "-imb"))
            {
                g_imageBrightness = atof(argv[++i]);
            }
            else if (0 == strcmp( argv[i], "-imc"))
            {
                g_imageContrast = atof(argv[++i]);
            }
            else if (0 == strcmp(argv[i], "-pts"))
            {
                g_printTiming = true;
            }
            else if (0 == strcmp(argv[i], "--show-closed"))
            {
                g_showClosedPoint = true;
            }
            else
            {
                cout << "Unsupported command line argument: " << argv[i] << "." << endl;
                exit(-1);
            }
        }
        if (g_showClosedPoint && (-1 == g_depthStreamProfileIdx))
        {
            cerr << "For --show-closed depth profile has be selected" << endl;
            exit(-1);
        }
    }
 }
 static void printStreamProperties(VideoCapture &capture)
 {
    size_t profilesCount = (size_t)capture.get(CAP_INTELPERC_IMAGE_GENERATOR | CAP_PROP_INTELPERC_PROFILE_COUNT);
    cout << "Image stream." << endl;
    cout << "  Brightness = " << capture.get(CAP_INTELPERC_IMAGE_GENERATOR | CAP_PROP_BRIGHTNESS) << endl;
    cout << "  Contrast = " << capture.get(CAP_INTELPERC_IMAGE_GENERATOR | CAP_PROP_CONTRAST) << endl;
    cout << "  Saturation = " << capture.get(CAP_INTELPERC_IMAGE_GENERATOR | CAP_PROP_SATURATION) << endl;
    cout << "  Hue = " << capture.get(CAP_INTELPERC_IMAGE_GENERATOR | CAP_PROP_HUE) << endl;
    cout << "  Gamma = " << capture.get(CAP_INTELPERC_IMAGE_GENERATOR | CAP_PROP_GAMMA) << endl;
    cout << "  Sharpness = " << capture.get(CAP_INTELPERC_IMAGE_GENERATOR | CAP_PROP_SHARPNESS) << endl;
    cout << "  Gain = " << capture.get(CAP_INTELPERC_IMAGE_GENERATOR | CAP_PROP_GAIN) << endl;
    cout << "  Backligh = " << capture.get(CAP_INTELPERC_IMAGE_GENERATOR | CAP_PROP_BACKLIGHT) << endl;
    cout << "Image streams profiles:" << endl;
    for (size_t i = 0; i < profilesCount; i++)
    {
        capture.set(CAP_INTELPERC_IMAGE_GENERATOR | CAP_PROP_INTELPERC_PROFILE_IDX, (double)i);
        cout << "  Profile[" << i << "]: ";
        cout << "width = " <<
            (int)capture.get(CAP_INTELPERC_IMAGE_GENERATOR | CAP_PROP_FRAME_WIDTH);
        cout << ", height = " <<
            (int)capture.get(CAP_INTELPERC_IMAGE_GENERATOR | CAP_PROP_FRAME_HEIGHT);
        cout << ", fps = " <<
            capture.get(CAP_INTELPERC_IMAGE_GENERATOR | CAP_PROP_FPS);
        cout << endl;
    }
    profilesCount = (size_t)capture.get(CAP_INTELPERC_DEPTH_GENERATOR | CAP_PROP_INTELPERC_PROFILE_COUNT);
    cout << "Depth stream." << endl;
    cout << "  Low confidence value = " << capture.get(CAP_INTELPERC_DEPTH_GENERATOR | CAP_PROP_INTELPERC_DEPTH_LOW_CONFIDENCE_VALUE) << endl;
    cout << "  Saturation value = " << capture.get(CAP_INTELPERC_DEPTH_GENERATOR | CAP_PROP_INTELPERC_DEPTH_SATURATION_VALUE) << endl;
    cout << "  Confidence threshold = " << capture.get(CAP_INTELPERC_DEPTH_GENERATOR | CAP_PROP_INTELPERC_DEPTH_CONFIDENCE_THRESHOLD) << endl;
    cout << "  Focal length = (" << capture.get(CAP_INTELPERC_DEPTH_GENERATOR | CAP_PROP_INTELPERC_DEPTH_FOCAL_LENGTH_HORZ) << ", "
        << capture.get(CAP_INTELPERC_DEPTH_GENERATOR | CAP_PROP_INTELPERC_DEPTH_FOCAL_LENGTH_VERT) << ")" << endl;
    cout << "Depth streams profiles:" << endl;
    for (size_t i = 0; i < profilesCount; i++)
    {
        capture.set(CAP_INTELPERC_DEPTH_GENERATOR | CAP_PROP_INTELPERC_PROFILE_IDX, (double)i);
        cout << "  Profile[" << i << "]: ";
        cout << "width = " <<
            (int)capture.get(CAP_INTELPERC_DEPTH_GENERATOR | CAP_PROP_FRAME_WIDTH);
        cout << ", height = " <<
            (int)capture.get(CAP_INTELPERC_DEPTH_GENERATOR | CAP_PROP_FRAME_HEIGHT);
        cout << ", fps = " <<
            capture.get(CAP_INTELPERC_DEPTH_GENERATOR | CAP_PROP_FPS);
        cout << endl;
    }
 }
 static void imshowImage(const char *winname, Mat &image, VideoCapture &capture)
 {
    if (g_showClosedPoint)
    {
        Mat uvMap;
        if (capture.retrieve(uvMap, CAP_INTELPERC_UVDEPTH_MAP))
        {
            float *uvmap = (float *)uvMap.ptr() + 2 * (g_closedDepthPoint[0] * uvMap.cols + g_closedDepthPoint[1]);
            int x = (int)((*uvmap) * image.cols); uvmap++;
            int y = (int)((*uvmap) * image.rows);
            if ((0 <= x) && (0 <= y))
            {
                static const int pointSize = 4;
                for (int row = y; row < min(y + pointSize, image.rows); row++)
                {
                    uchar* ptrDst = image.ptr(row) + x * 3 + 2;//+2 -> Red
                    for (int col = 0; col < min(pointSize, image.cols - x); col++, ptrDst+=3)
                    {
                        *ptrDst = 255;
                    }
                }
            }
        }
    }
    imshow(winname, image);
 }
 static void imshowIR(const char *winname, Mat &ir)
 {
    Mat image;
    if (g_showClosedPoint)
    {
        image.create(ir.rows, ir.cols, CV_8UC3);
        for (int row = 0; row < ir.rows; row++)
        {
            uchar* ptrDst = image.ptr(row);
            short* ptrSrc = (short*)ir.ptr(row);
            for (int col = 0; col < ir.cols; col++, ptrSrc++)
            {
                uchar val = (uchar) ((*ptrSrc) >> 2);
                *ptrDst = val;  ptrDst++;
                *ptrDst = val;  ptrDst++;
                *ptrDst = val;  ptrDst++;
            }
        }
        static const int pointSize = 4;
        for (int row = g_closedDepthPoint[0]; row < min(g_closedDepthPoint[0] + pointSize, image.rows); row++)
        {
            uchar* ptrDst = image.ptr(row) + g_closedDepthPoint[1] * 3 + 2;//+2 -> Red
            for (int col = 0; col < min(pointSize, image.cols - g_closedDepthPoint[1]); col++, ptrDst+=3)
            {
                *ptrDst = 255;
            }
        }
    }
    else
    {
        image.create(ir.rows, ir.cols, CV_8UC1);
        for (int row = 0; row < ir.rows; row++)
        {
            uchar* ptrDst = image.ptr(row);
            short* ptrSrc = (short*)ir.ptr(row);
            for (int col = 0; col < ir.cols; col++, ptrSrc++, ptrDst++)
            {
                *ptrDst = (uchar) ((*ptrSrc) >> 2);
            }
        }
    }
    imshow(winname, image);
 }
 static void imshowDepth(const char *winname, Mat &depth, VideoCapture &capture)
 {
    short lowValue = (short)capture.get(CAP_INTELPERC_DEPTH_GENERATOR | CAP_PROP_INTELPERC_DEPTH_LOW_CONFIDENCE_VALUE);
    short saturationValue = (short)capture.get(CAP_INTELPERC_DEPTH_GENERATOR | CAP_PROP_INTELPERC_DEPTH_SATURATION_VALUE);
    Mat image;
    if (g_showClosedPoint)
    {
        image.create(depth.rows, depth.cols, CV_8UC3);
        for (int row = 0; row < depth.rows; row++)
        {
            uchar* ptrDst = image.ptr(row);
            short* ptrSrc = (short*)depth.ptr(row);
            for (int col = 0; col < depth.cols; col++, ptrSrc++)
            {
                if ((lowValue == (*ptrSrc)) || (saturationValue == (*ptrSrc)))
                {
                    *ptrDst = 0; ptrDst++;
                    *ptrDst = 0; ptrDst++;
                    *ptrDst = 0; ptrDst++;
                }
                else
                {
                    uchar val = (uchar) ((*ptrSrc) >> 2);
                    *ptrDst = val;  ptrDst++;
                    *ptrDst = val;  ptrDst++;
                    *ptrDst = val;  ptrDst++;
                }
            }
        }
        static const int pointSize = 4;
        for (int row = g_closedDepthPoint[0]; row < min(g_closedDepthPoint[0] + pointSize, image.rows); row++)
        {
            uchar* ptrDst = image.ptr(row) + g_closedDepthPoint[1] * 3 + 2;//+2 -> Red
            for (int col = 0; col < min(pointSize, image.cols - g_closedDepthPoint[1]); col++, ptrDst+=3)
            {
                *ptrDst = 255;
            }
        }
    }
    else
    {
        image.create(depth.rows, depth.cols, CV_8UC1);
        for (int row = 0; row < depth.rows; row++)
        {
            uchar* ptrDst = image.ptr(row);
            short* ptrSrc = (short*)depth.ptr(row);
            for (int col = 0; col < depth.cols; col++, ptrSrc++, ptrDst++)
            {
                if ((lowValue == (*ptrSrc)) || (saturationValue == (*ptrSrc)))
                    *ptrDst = 0;
                else
                    *ptrDst = (uchar) ((*ptrSrc) >> 2);
            }
        }
    }
    imshow(winname, image);
 }
 int main(int argc, char* argv[])
 {
    parseCMDLine(argc, argv);
    VideoCapture capture;
    capture.open(CAP_INTELPERC);
    if (!capture.isOpened())
    {
        cerr << "Can not open a capture object." << endl;
        return -1;
    }
    if (g_printStreamSetting)
        printStreamProperties(capture);
    if (-1 != g_imageStreamProfileIdx)
    {
        if (!capture.set(CAP_INTELPERC_IMAGE_GENERATOR | CAP_PROP_INTELPERC_PROFILE_IDX, (double)g_imageStreamProfileIdx))
        {
            cerr << "Can not setup a image stream." << endl;
            return -1;
        }
    }
    if (-1 != g_depthStreamProfileIdx)
    {
        if (!capture.set(CAP_INTELPERC_DEPTH_GENERATOR | CAP_PROP_INTELPERC_PROFILE_IDX, (double)g_depthStreamProfileIdx))
        {
            cerr << "Can not setup a depth stream." << endl;
            return -1;
        }
    }
    else if (g_irStreamShow)
    {
        if (!capture.set(CAP_INTELPERC_DEPTH_GENERATOR | CAP_PROP_INTELPERC_PROFILE_IDX, 0.0))
        {
            cerr << "Can not setup a IR stream." << endl;
            return -1;
        }
    }
    else
    {
        cout << "Streams not selected" << endl;
        return 0;
    }
    //Setup additional properies only after set profile of the stream
    if ( (-10000.0 < g_imageBrightness) && (g_imageBrightness < 10000.0))
        capture.set(CAP_INTELPERC_IMAGE_GENERATOR | CAP_PROP_BRIGHTNESS, g_imageBrightness);
    if ( (0 < g_imageContrast) && (g_imageContrast < 10000.0))
        capture.set(CAP_INTELPERC_IMAGE_GENERATOR | CAP_PROP_BRIGHTNESS, g_imageContrast);
    int frame = 0;
    for(;;frame++)
    {
        Mat bgrImage;
        Mat depthImage;
        Mat irImage;
        if (!capture.grab())
        {
            cout << "Can not grab images." << endl;
            return -1;
        }
        if ((-1 != g_depthStreamProfileIdx) && (capture.retrieve(depthImage, CAP_INTELPERC_DEPTH_MAP)))
        {
            if (g_showClosedPoint)
            {
                double minVal = 0.0; double maxVal = 0.0;
                minMaxIdx(depthImage, &minVal, &maxVal, g_closedDepthPoint);
            }
            imshowDepth("depth image", depthImage, capture);
        }
        if ((g_irStreamShow) && (capture.retrieve(irImage, CAP_INTELPERC_IR_MAP)))
            imshowIR("ir image", irImage);
        if ((-1 != g_imageStreamProfileIdx) && (capture.retrieve(bgrImage, CAP_INTELPERC_IMAGE)))
            imshowImage("color image", bgrImage, capture);
        if (g_printTiming)
        {
            cout << "Image frame: " << capture.get(CAP_INTELPERC_IMAGE_GENERATOR | CAP_PROP_POS_FRAMES)
                 << ", Depth(IR) frame: " << capture.get(CAP_INTELPERC_DEPTH_GENERATOR | CAP_PROP_POS_FRAMES) << endl;
            cout << "Image frame: " << capture.get(CAP_INTELPERC_IMAGE_GENERATOR | CAP_PROP_POS_MSEC)
                 << ", Depth(IR) frame: " << capture.get(CAP_INTELPERC_DEPTH_GENERATOR | CAP_PROP_POS_MSEC) << endl;
        }
        if( waitKey(30) >= 0 )
            break;
    }
    return 0;
 }
--- a/samples/cpp/tutorial_code/ml/introduction_to_svm/introduction_to_svm.cpp
+++ b/samples/cpp/tutorial_code/ml/introduction_to_svm/introduction_to_svm.cpp
@ -32,13 +32,13 @@ int main()
    for (int i = 0; i < image.rows; ++i)
        for (int j = 0; j < image.cols; ++j)
        {
-            Mat sampleMat = (Mat_<float>(1,2) << i,j);
+            Mat sampleMat = (Mat_<float>(1,2) << j,i);
            float response = SVM.predict(sampleMat);
            if (response == 1)
-                image.at<Vec3b>(j, i)  = green;
+                image.at<Vec3b>(i,j)  = green;
            else if (response == -1)
-                 image.at<Vec3b>(j, i)  = blue;
+                 image.at<Vec3b>(i,j)  = blue;
        }
    // Show the training data
--- a/samples/ocl/facedetect.cpp
+++ b/samples/ocl/facedetect.cpp
@ -14,7 +14,10 @@
 using namespace std;
 using namespace cv;
 #define LOOP_NUM 1
 #define MAX_THREADS 10
 ///////////////////////////single-threading faces detecting///////////////////////////////
@ -29,23 +32,23 @@ const static Scalar colors[] =  { CV_RGB(0,0,255),
                                } ;
-int64 work_begin = 0;
+int64 work_begin[MAX_THREADS] = {0};
-int64 work_end = 0;
+int64 work_total[MAX_THREADS] = {0};
 string inputName, outputName, cascadeName;
-static void workBegin()
+static void workBegin(int i = 0)
 {
-    work_begin = getTickCount();
+    work_begin[i] = getTickCount();
 }
-static void workEnd()
+static void workEnd(int i = 0)
 {
-    work_end += (getTickCount() - work_begin);
+    work_total[i] += (getTickCount() - work_begin[i]);
 }
-static double getTime()
+static double getTotalTime(int i = 0)
 {
-    return work_end /((double)cvGetTickFrequency() * 1000.);
+    return work_total[i] /getTickFrequency() * 1000.;
 }
@ -98,7 +101,6 @@ static int facedetect_one_thread(bool useCPU, double scale )
        }
    }
    cvNamedWindow( "result", 1 );
    if( capture )
    {
        cout << "In capture ..." << endl;
@ -118,7 +120,6 @@ static int facedetect_one_thread(bool useCPU, double scale )
            else
                resize(frameCopy0, frameCopy, Size(), 1./scale, 1./scale, INTER_LINEAR);
            work_end = 0;
            if(useCPU)
                detectCPU(frameCopy, faces, cpu_cascade, 1);
            else
@ -132,16 +133,16 @@ static int facedetect_one_thread(bool useCPU, double scale )
    }
    else
    {
-        cout << "In image read" << endl;
+        cout << "In image read " << image.size() << endl;
        vector<Rect> faces;
        vector<Rect> ref_rst;
        double accuracy = 0.;
        detectCPU(image, ref_rst, cpu_cascade, scale);
        work_end = 0;
        cout << "loops: ";
        for(int i = 0; i <= LOOP_NUM; i ++)
        {
-            cout << "loop" << i << endl;
+            cout << i << ", ";
            if(useCPU)
                detectCPU(image, faces, cpu_cascade, scale);
            else
@ -152,16 +153,15 @@ static int facedetect_one_thread(bool useCPU, double scale )
                    accuracy = checkRectSimilarity(image.size(), ref_rst, faces);
                }
            }
            if (i == LOOP_NUM)
            {
                if (useCPU)
                    cout << "average CPU time (noCamera) : ";
                else
                    cout << "average GPU time (noCamera) : ";
                cout << getTime() / LOOP_NUM << " ms" << endl;
                cout << "accuracy value: " << accuracy <<endl;
            }
        }
        cout << "done!" << endl;
        if (useCPU)
            cout << "average CPU time (noCamera) : ";
        else
            cout << "average GPU time (noCamera) : ";
        cout << getTotalTime() / LOOP_NUM << " ms" << endl;
        cout << "accuracy value: " << accuracy <<endl;
        Draw(image, faces, scale);
        waitKey(0);
    }
@ -174,9 +174,7 @@ static int facedetect_one_thread(bool useCPU, double scale )
 ///////////////////////////////////////detectfaces with multithreading////////////////////////////////////////////
 #if defined(_MSC_VER) && (_MSC_VER >= 1700)
-#define MAX_THREADS 10
+static void detectFaces(std::string fileName, int threadNum)
 static void detectFaces(std::string fileName)
 {
    ocl::OclCascadeClassifier cascade;
    if(!cascade.load(cascadeName))
@ -188,7 +186,7 @@ static void detectFaces(std::string fileName)
    Mat img = imread(fileName, CV_LOAD_IMAGE_COLOR);
    if (img.empty())
    {
-        std::cout << "cann't open file " + fileName <<std::endl;
+        std::cout << '[' << threadNum << "] " << "can't open file " + fileName <<std::endl;
        return;
    }
@ -196,23 +194,37 @@ static void detectFaces(std::string fileName)
    d_img.upload(img);
    std::vector<Rect> oclfaces;
-    cascade.detectMultiScale(d_img, oclfaces,  1.1, 3, 0 | CASCADE_SCALE_IMAGE, Size(30, 30), Size(0, 0));
+    std::thread::id tid = std::this_thread::get_id();
    std::cout << '[' << threadNum << "] "
        << "ThreadID = " << tid
        << ", CommandQueue = " << *(void**)ocl::getClCommandQueuePtr()
        << endl;
    for(int i = 0; i <= LOOP_NUM; i++)
    {
        if(i>0) workBegin(threadNum);
        cascade.detectMultiScale(d_img, oclfaces,  1.1, 3, 0|CASCADE_SCALE_IMAGE, Size(30, 30), Size(0, 0));
        if(i>0) workEnd(threadNum);
    }
    std::cout << '[' << threadNum << "] " << "Average time = " << getTotalTime(threadNum) / LOOP_NUM << " ms" << endl;
    for(unsigned int i = 0; i<oclfaces.size(); i++)
        rectangle(img, Point(oclfaces[i].x, oclfaces[i].y), Point(oclfaces[i].x + oclfaces[i].width, oclfaces[i].y + oclfaces[i].height), colors[i%8], 3);
    std::string::size_type pos = outputName.rfind('.');
-    std::string outputNameTid = outputName + '-' + std::to_string(_threadid);
+    std::string strTid = std::to_string(_threadid);
-    if(pos == std::string::npos)
+    if( !outputName.empty() )
    {
-        std::cout << "Invalid output file name: " << outputName << std::endl;
+        if(pos == std::string::npos)
        {
            std::cout << "Invalid output file name: " << outputName << std::endl;
        }
        else
        {
            std::string outputNameTid = outputName.substr(0, pos) + "_" + strTid + outputName.substr(pos);
            imwrite(outputNameTid, img);
        }
    }
-    else
+    imshow(strTid, img);
    {
        outputNameTid = outputName.substr(0, pos) + "_" + std::to_string(_threadid) + outputName.substr(pos);
        imwrite(outputNameTid, img);
    }
    imshow(outputNameTid, img);
    waitKey(0);
 }
@ -221,7 +233,7 @@ static void facedetect_multithreading(int nthreads)
    int thread_number = MAX_THREADS < nthreads ? MAX_THREADS : nthreads;
    std::vector<std::thread> threads;
    for(int i = 0; i<thread_number; i++)
-        threads.push_back(std::thread(detectFaces, inputName));
+        threads.push_back(std::thread(detectFaces, inputName, i));
    for(int i = 0; i<thread_number; i++)
        threads[i].join();
 }
@ -237,8 +249,7 @@ int main( int argc, const char** argv )
        " specify template file path }"
        "{ c scale      |   1.0       | scale image }"
        "{ s use_cpu    | false       | use cpu or gpu to process the image }"
-        "{ o output     | facedetect_output.jpg  |"
+        "{ o output     | | specify output image save path(only works when input is images) }"
        " specify output image save path(only works when input is images) }"
        "{ n thread_num |      1      | set number of threads >= 1 }";
    CommandLineParser cmd(argc, argv, keys);
@ -312,8 +323,6 @@ void detectCPU( Mat& img, vector<Rect>& faces,
 void Draw(Mat& img, vector<Rect>& faces, double scale)
 {
    int i = 0;
    putText(img, format("fps: %.1f", 1000./getTime()), Point(450, 50),
            FONT_HERSHEY_SIMPLEX, 1, Scalar(0,255,0), 3);
    for( vector<Rect>::const_iterator r = faces.begin(); r != faces.end(); r++, i++ )
    {
        Point center;
@ -324,8 +333,8 @@ void Draw(Mat& img, vector<Rect>& faces, double scale)
        radius = cvRound((r->width + r->height)*0.25*scale);
        circle( img, center, radius, color, 3, 8, 0 );
    }
-    //imwrite( outputName, img );
+    //if( !outputName.empty() ) imwrite( outputName, img );
-    if(abs(scale-1.0)>.001)
+    if( abs(scale-1.0)>.001 )
    {
        resize(img, img, Size((int)(img.cols/scale), (int)(img.rows/scale)));
    }